[llvm] 475ce4c - RFC: Uniformity Analysis for Irreducible Control Flow

Sameer Sahasrabuddhe via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 19 17:52:48 PST 2022


Author: Sameer Sahasrabuddhe
Date: 2022-12-20T07:22:24+05:30
New Revision: 475ce4c200ca640f1d6ccd097b167a04f009cb18

URL: https://github.com/llvm/llvm-project/commit/475ce4c200ca640f1d6ccd097b167a04f009cb18
DIFF: https://github.com/llvm/llvm-project/commit/475ce4c200ca640f1d6ccd097b167a04f009cb18.diff

LOG: RFC: Uniformity Analysis for Irreducible Control Flow

Uniformity analysis is a generalization of divergence analysis to
include irreducible control flow:

  1. The proposed spec presents a notion of "maximal convergence" that
     captures the existing convention of converging threads at the
     headers of natural loops.

  2. Maximal convergence is then extended to irreducible cycles. The
     identity of irreducible cycles is determined by the choices made
     in a depth-first traversal of the control flow graph. Uniformity
     analysis uses criteria that depend only on closed paths and not
     cycles, to determine maximal convergence. This makes it a
     conservative analysis that is independent of the effect of DFS on
     CycleInfo.

  3. The analysis is implemented as a template that can be
     instantiated for both LLVM IR and Machine IR.

Validation:
  - passes existing tests for divergence analysis
  - passes new tests with irreducible control flow
  - passes equivalent tests in MIR and GMIR

Based on concepts originally outlined by
Nicolai Haehnle <nicolai.haehnle at amd.com>

With contributions from Ruiling Song <ruiling.song at amd.com> and
Jay Foad <jay.foad at amd.com>.

Support for GMIR and lit tests for GMIR/MIR added by
Yashwant Singh <yashwant.singh at amd.com>.

Differential Revision: https://reviews.llvm.org/D130746

Added: 
    llvm/docs/ConvergenceAndUniformity.rst
    llvm/docs/convergence-both-diverged-nested.png
    llvm/docs/convergence-closed-path.png
    llvm/docs/convergence-divergent-inside.png
    llvm/docs/convergence-divergent-outside.png
    llvm/docs/convergence-natural-loop.png
    llvm/include/llvm/ADT/GenericUniformityImpl.h
    llvm/include/llvm/ADT/GenericUniformityInfo.h
    llvm/include/llvm/ADT/Uniformity.h
    llvm/include/llvm/Analysis/UniformityAnalysis.h
    llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
    llvm/lib/Analysis/UniformityAnalysis.cpp
    llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/join-loopexit-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/loads-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/never-uniform.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/branch-outside.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/exit-divergence.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-1.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/reducible-headers.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-heart.ll

Modified: 
    llvm/docs/CycleTerminology.rst
    llvm/docs/Reference.rst
    llvm/include/llvm/ADT/GenericCycleInfo.h
    llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
    llvm/include/llvm/CodeGen/MachinePassRegistry.def
    llvm/include/llvm/CodeGen/MachineSSAContext.h
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/include/llvm/IR/SSAContext.h
    llvm/include/llvm/InitializePasses.h
    llvm/lib/Analysis/CMakeLists.txt
    llvm/lib/CodeGen/CMakeLists.txt
    llvm/lib/CodeGen/CodeGen.cpp
    llvm/lib/CodeGen/MachineCycleAnalysis.cpp
    llvm/lib/CodeGen/MachineSSAContext.cpp
    llvm/lib/IR/SSAContext.cpp
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Passes/PassRegistry.def
    llvm/lib/Target/AMDGPU/SIDefines.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-exit.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/propagate-loop-live-out.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/trivial-join-at-loop-exit.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
    llvm/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
    llvm/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
    llvm/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
    llvm/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll

Removed: 
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll


################################################################################
diff --git a/llvm/docs/ConvergenceAndUniformity.rst b/llvm/docs/ConvergenceAndUniformity.rst
new file mode 100644
index 0000000000000..c098ef8635257
--- /dev/null
+++ b/llvm/docs/ConvergenceAndUniformity.rst
@@ -0,0 +1,697 @@
+==========================
+Convergence And Uniformity
+==========================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+Some parallel environments execute threads in groups that allow
+communication within the group using special primitives called
+*convergent* operations. The outcome of a convergent operation is
+sensitive to the set of threads that executes it "together", i.e.,
+convergently.
+
+A value is said to be *uniform* across a set of threads if it is the
+same across those threads, and *divergent* otherwise. Correspondingly,
+a branch is said to be a uniform branch if its condition is uniform,
+and it is a divergent branch otherwise.
+
+Whether threads are *converged* or not depends on the paths they take
+through the control flow graph. Threads take different outgoing edges
+at a *divergent branch*. Divergent branches constrain
+program transforms such as changing the CFG or moving a convergent
+operation to a different point of the CFG. Performing these
+transformations across a divergent branch can change the sets of
+threads that execute convergent operations convergently. While these
+constraints are out of scope for this document, the described
+*uniformity analysis* allows these transformations to identify
+uniform branches where these constraints do not hold.
+
+Convergence and
+uniformity are inter-dependent: When threads diverge at a divergent
+branch, they may later *reconverge* at a common program point.
+Subsequent operations are performed convergently, but the inputs may
+be non-uniform, thus producing divergent outputs.
+
+Uniformity is also useful by itself on targets that execute threads in
+groups with shared execution resources (e.g. waves, warps, or
+subgroups):
+
+- Uniform outputs can potentially be computed or stored on shared
+  resources.
+- These targets must "linearize" a divergent branch to ensure that
+  each side of the branch is followed by the corresponding threads in
+  the same group. But linearization is unnecessary at uniform
+  branches, since the whole group of threads follows either one side
+  of the branch or the other.
+
+This document presents a definition of convergence that is reasonable
+for real targets and is compatible with the currently implicit
+semantics of convergent operations in LLVM IR. This is accompanied by
+a *uniformity analysis* that extends the existing divergence analysis
+[DivergenceSPMD]_ to cover irreducible control-flow.
+
+.. [DivergenceSPMD] Julian Rosemann, Simon Moll, and Sebastian
+   Hack. 2021. An Abstract Interpretation for SPMD Divergence on
+   Reducible Control Flow Graphs. Proc. ACM Program. Lang. 5, POPL,
+   Article 31 (January 2021), 35 pages.
+   https://doi.org/10.1145/3434312
+
+Terminology
+===========
+
+Cycles
+   Described in :ref:`cycle-terminology`.
+
+Closed path
+   Described in :ref:`cycle-closed-path`.
+
+Disjoint paths
+   Two paths in a CFG are said to be disjoint if the only nodes common
+   to both are the start node or the end node, or both.
+
+Join node
+   A join node of a branch is a node reachable along disjoint paths
+   starting from that branch.
+
+Diverged path
+   A diverged path is a path that starts from a divergent branch and
+   either reaches a join node of the branch or reaches the end of the
+   function without passing through any join node of the branch.
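+
+To make the last two definitions concrete, here is a minimal, self-contained
+C++ sketch that finds the join nodes of a branch on a small hard-coded CFG by
+propagating a distinct label from each successor of the branch and marking
+the blocks where different labels meet. The CFG, the block names and the
+propagation scheme are hypothetical illustrations of the definitions above,
+not the algorithm used by the analysis itself.
+
+.. code-block:: c++
+
+   #include <iostream>
+   #include <map>
+   #include <set>
+   #include <string>
+   #include <vector>
+
+   int main() {
+     // Hypothetical CFG ("A" is the branch of interest):
+     //   A -> B, A -> C, B -> D, C -> D, C -> E, D -> F, E -> F
+     std::map<std::string, std::vector<std::string>> Succs = {
+         {"A", {"B", "C"}}, {"B", {"D"}}, {"C", {"D", "E"}},
+         {"D", {"F"}},      {"E", {"F"}}, {"F", {}}};
+
+     // Each successor of A starts with its own label; a block reached by two
+     // different labels is a join of A and then propagates its own label.
+     std::map<std::string, std::string> Label = {{"B", "B"}, {"C", "C"}};
+     std::set<std::string> Joins;
+     bool Changed = true;
+     while (Changed) {
+       Changed = false;
+       for (const auto &Entry : Succs) {
+         const std::string &Block = Entry.first;
+         if (!Label.count(Block))
+           continue;
+         std::string Incoming = Joins.count(Block) ? Block : Label[Block];
+         for (const std::string &S : Entry.second) {
+           if (!Label.count(S)) {
+             Label[S] = Incoming;
+             Changed = true;
+           } else if (Label[S] != Incoming && !Joins.count(S)) {
+             Joins.insert(S); // two distinct labels meet: S is a join of A
+             Label[S] = S;
+             Changed = true;
+           }
+         }
+       }
+     }
+
+     for (const std::string &J : Joins)
+       std::cout << J << " is a join node of the branch in A\n";
+     return 0;
+   }
+
+Here ``D`` and ``F`` are reported as join nodes of ``A``; the paths
+``A -> B -> D`` and ``A -> C -> D`` are an example of disjoint paths.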
+
+Threads and Dynamic Instances
+=============================
+
+Each occurrence of an instruction in the program source is called a
+*static instance*. When a thread executes a program, each execution of
+a static instance produces a distinct *dynamic instance* of that
+instruction.
+
+Each thread produces a unique sequence of dynamic instances:
+
+- The sequence is generated along branch decisions and loop
+  traversals.
+- Starts with a dynamic instance of a "first" instruction.
+- Continues with dynamic instances of successive "next"
+  instructions.
+
+Threads are independent; some targets may choose to execute them in
+groups in order to share resources when possible.
+
+.. figure:: convergence-natural-loop.png
+   :name: convergence-natural-loop
+
+.. table::
+   :name: convergence-thread-example
+   :align: left
+
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+   |          |        | 1   | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   |      |
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 1 | Entry1 | H1  | B1  | L1  | H3  |     | L3  |     |     |     | Exit |
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 2 | Entry2 | H2  |     | L2  | H4  | B2  | L4  | H5  | B3  | L5  | Exit |
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+
+In the above table, each row is a different thread, listing the
+dynamic instances produced by that thread from left to right. Each
+thread executes the same program that starts with an ``Entry`` node
+and ends with an ``Exit`` node, but different threads may take
+different paths through the control flow of the program. The columns
+are numbered merely for convenience, and empty cells have no special
+meaning. Dynamic instances listed in the same column are converged.
+
+.. _convergence-definition:
+
+Convergence
+===========
+
+*Converged-with* is a transitive symmetric relation over dynamic
+instances produced by *different threads* for the *same static
+instance*. Informally, two threads that produce converged dynamic
+instances are said to be *converged*, and they are said to execute
+that static instance *convergently*, at that point in the execution.
+
+*Convergence order* is a strict partial order over dynamic instances
+that is defined as the transitive closure of:
+
+1. If dynamic instance ``P`` is executed strictly before ``Q`` in the
+   same thread, then ``P`` is *convergence-before* ``Q``.
+2. If dynamic instance ``P`` is executed strictly before ``Q1`` in the
+   same thread, and ``Q1`` is *converged-with* ``Q2``, then ``P`` is
+   *convergence-before* ``Q2``.
+3. If dynamic instance ``P1`` is *converged-with* ``P2``, and ``P2``
+   is executed strictly before ``Q`` in the same thread, then ``P1``
+   is *convergence-before* ``Q``.
+
+.. table::
+   :name: convergence-order-example
+   :align: left
+
+   +----------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+   |          | 1     | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9    |
+   +----------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 1 | Entry | ... |     |     |     | S2  | T   | ... | Exit |
+   +----------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 2 | Entry | ... |     | Q2  | R   | S1  |     | ... | Exit |
+   +----------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 3 | Entry | ... | P   | Q1  |     |     |     | ... |      |
+   +----------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+
+The above table shows partial sequences of dynamic instances from
+different threads. Dynamic instances in the same column are assumed
+to be converged (i.e., related to each other in the converged-with
+relation). The resulting convergence order includes the edges ``P ->
+Q2``, ``Q1 -> R``, ``P -> R``, ``P -> T``, etc.
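+
+As a cross-check of that claim, the sketch below encodes just the named
+dynamic instances from the table (per-thread program order plus the two
+converged columns) and derives *convergence-before* by applying the three
+rules and taking the transitive closure. The data structures and names are
+hypothetical; the sketch only restates the definition on this example.
+
+.. code-block:: c++
+
+   #include <iostream>
+   #include <map>
+   #include <set>
+   #include <string>
+   #include <utility>
+   #include <vector>
+
+   int main() {
+     // Program order of the named instances in each thread (rule 1).
+     std::vector<std::vector<std::string>> Threads = {
+         {"S2", "T"}, {"Q2", "R", "S1"}, {"P", "Q1"}};
+     // Instances in the same column of the table are converged-with.
+     std::vector<std::pair<std::string, std::string>> ConvergedWith = {
+         {"Q1", "Q2"}, {"S1", "S2"}};
+
+     std::map<std::string, std::set<std::string>> Before;
+     for (const auto &Seq : Threads)               // rule 1: program order
+       for (size_t I = 0; I < Seq.size(); ++I)
+         for (size_t J = I + 1; J < Seq.size(); ++J)
+           Before[Seq[I]].insert(Seq[J]);
+     for (const auto &CW : ConvergedWith) {        // rules 2 and 3
+       for (auto &Entry : Before) {
+         if (Entry.second.count(CW.first))
+           Entry.second.insert(CW.second);
+         if (Entry.second.count(CW.second))
+           Entry.second.insert(CW.first);
+       }
+       std::set<std::string> A = Before[CW.first], B = Before[CW.second];
+       Before[CW.first].insert(B.begin(), B.end());
+       Before[CW.second].insert(A.begin(), A.end());
+     }
+     bool Changed = true;                          // transitive closure
+     while (Changed) {
+       Changed = false;
+       for (auto &Entry : Before) {
+         std::set<std::string> Reach = Entry.second;
+         for (const std::string &Mid : Reach) {
+           auto It = Before.find(Mid);
+           if (It == Before.end())
+             continue;
+           for (const std::string &Next : It->second)
+             if (Next != Entry.first)
+               Changed |= Entry.second.insert(Next).second;
+         }
+       }
+     }
+
+     for (const char *Edge : {"Q2", "R", "T"})
+       std::cout << "P -> " << Edge << ": " << Before["P"].count(Edge) << "\n";
+     return 0;
+   }
+
+This prints ``1`` for each of ``P -> Q2``, ``P -> R`` and ``P -> T``; the
+remaining edge ``Q1 -> R`` mentioned above follows in the same way.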
+
+The fact that *convergence-before* is a strict partial order is a
+constraint on the *converged-with* relation. It is trivially satisfied
+if different dynamic instances are never converged. It is also
+trivially satisfied for all known implementations for which
+convergence plays some role. Aside from the strict partial convergence
+order, there are currently no additional constraints on the
+*converged-with* relation imposed in LLVM IR.
+
+.. _convergence-note-convergence:
+
+.. note::
+
+   1. The ``convergent`` attribute on convergent operations does
+      constrain changes to ``converged-with``, but it is expressed in
+      terms of control flow and does not explicitly deal with thread
+      convergence.
+
+   2. The convergence-before relation is not
+      directly observable. Program transforms are in general free to
+      change the order of instructions, even though that obviously
+      changes the convergence-before relation.
+
+   3. Converged dynamic instances need not be executed at the same
+      time or even on the same resource. Converged dynamic instances
+      of a convergent operation may appear to do so but that is an
+      implementation detail. The fact that ``P`` is convergence-before
+      ``Q`` does not automatically imply that ``P`` happens-before
+      ``Q`` in a memory model sense.
+
+   4. **Future work:** Providing convergence-related guarantees to
+      compiler frontends enables some powerful optimization techniques
+      that can be used by programmers or by high-level program
+      transforms. Constraints on the ``converged-with`` relation may
+      be added eventually as part of the definition of LLVM
+      IR, so that guarantees can be made that frontends can rely on.
+      For a proposal on how this might work, see `D85603
+      <https://reviews.llvm.org/D85603>`_.
+
+.. _convergence-maximal:
+
+Maximal Convergence
+-------------------
+
+This section defines a constraint that may be used to
+produce a *maximal converged-with* relation without violating the
+strict *convergence-before* order. This maximal converged-with
+relation is reasonable for real targets and is compatible with
+convergent operations.
+
+The maximal converged-with relation is defined in terms of cycle
+headers, which are not unique to a given CFG. Each cycle hierarchy for
+the same CFG results in a different maximal converged-with relation.
+
+   **Maximal converged-with:**
+
+   Dynamic instances ``X1`` and ``X2`` produced by different threads
+   for the same static instance ``X`` are converged in the maximal
+   converged-with relation if and only if for every cycle ``C`` with
+   header ``H`` that contains ``X``:
+
+   - every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in
+     the respective thread is convergence-before ``X2``, and,
+   - every dynamic instance ``H2`` of ``H`` that precedes ``X2`` in
+     the respective thread is convergence-before ``X1``,
+   - without assuming that ``X1`` is converged with ``X2``.
+
+.. note::
+
+   For brevity, the rest of the document restricts the term
+   *converged* to mean "related under the maximal converged-with
+   relation for the given cycle hierarchy".
+
+Maximal convergence can now be demonstrated in the earlier example as follows:
+
+.. table::
+   :align: left
+
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+   |          |        | 1   | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   |      |
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 1 | Entry1 | H1  | B1  | L1  | H3  |     | L3  |     |     |     | Exit |
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+   | Thread 2 | Entry2 | H2  |     | L2  | H4  | B2  | L4  | H5  | B3  | L5  | Exit |
+   +----------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+
+- ``Entry1`` and ``Entry2`` are converged.
+- ``H1`` and ``H2`` are converged.
+- ``B1`` and ``B2`` are not converged due to ``H4`` which is not
+  convergence-before ``B1``.
+- ``H3`` and ``H4`` are converged.
+- ``H3`` is not converged with ``H5`` due to ``H4`` which is not
+  convergence-before ``H3``.
+- ``L1`` and ``L2`` are converged.
+- ``L3`` and ``L4`` are converged.
+- ``L3`` is not converged with ``L5`` due to ``H5`` which is not
+  convergence-before ``L3``.
+
+.. _convergence-cycle-headers:
+
+Dependence on Cycle Headers
+----------------------------
+
+Contradictions in convergence order are possible only between two
+nodes that are inside some cycle. The dynamic instances of such nodes
+may be interleaved in the same thread, and this interleaving may be
+different for different threads.
+
+When a thread executes a node ``X`` once and then executes it again,
+it must have followed a closed path in the CFG that includes ``X``.
+Such a path must pass through the header of at least one cycle --- the
+smallest cycle that includes the entire closed path. In a given
+thread, two dynamic instances of ``X`` are either separated by the
+execution of at least one cycle header, or ``X`` itself is a cycle
+header.
+
+In reducible cycles (natural loops), each execution of the header is
+equivalent to the start of a new iteration of the cycle. But this
+analogy breaks down in the presence of explicit constraints on the
+converged-with relation, such as those described in :ref:`future
+work<convergence-note-convergence>`. Instead, cycle headers should be
+treated as implicit *points of convergence* in a maximal
+converged-with relation.
+
+Consider a sequence of nested cycles ``C1``, ``C2``, ..., ``Ck`` such
+that ``C1`` is the outermost cycle and ``Ck`` is the innermost cycle,
+with headers ``H1``, ``H2``, ..., ``Hk`` respectively. When a thread
+enters the cycle ``Ck``, any of the following is possible:
+
+1. The thread directly entered cycle ``Ck`` without having executed
+   any of the headers ``H1`` to ``Hk``.
+
+2. The thread executed some or all of the nested headers one or more
+   times.
+
+The maximal converged-with relation captures the following intuition
+about cycles:
+
+1. When two threads enter a top-level cycle ``C1``, they execute
+   converged dynamic instances of every node that is a :ref:`child
+   <cycle-parent-block>` of ``C1``.
+
+2. When two threads enter a nested cycle ``Ck``, they execute
+   converged dynamic instances of every node that is a child of
+   ``Ck``, until either thread exits ``Ck``, if and only if they
+   executed converged dynamic instances of the last nested header that
+   either thread encountered.
+
+   Note that when a thread exits a nested cycle ``Ck``, it must follow
+   a closed path outside ``Ck`` to reenter it. This requires executing
+   the header of some outer cycle, as described earlier.
+
+Consider two dynamic instances ``X1`` and ``X2`` produced by threads ``T1``
+and ``T2`` for a node ``X`` that is a child of nested cycle ``Ck``.
+Maximal convergence relates ``X1`` and ``X2`` as follows:
+
+1. If neither thread executed any header from ``H1`` to ``Hk``, then
+   ``X1`` and ``X2`` are converged.
+
+2. Otherwise, if there are no converged dynamic instances ``Q1`` and
+   ``Q2`` of any header ``Q`` from ``H1`` to ``Hk`` (where ``Q`` is
+   possibly the same as ``X``), such that ``Q1`` precedes ``X1`` and
+   ``Q2`` precedes ``X2`` in the respective threads, then ``X1`` and
+   ``X2`` are not converged.
+
+3. Otherwise, consider the pair ``Q1`` and ``Q2`` of converged dynamic
+   instances of a header ``Q`` from ``H1`` to ``Hk`` that occur most
+   recently before ``X1`` and ``X2`` in the respective threads. Then
+   ``X1`` and ``X2`` are converged if and only if there is no dynamic
+   instance of any header from ``H1`` to ``Hk`` that occurs between
+   ``Q1`` and ``X1`` in thread ``T1``, or between ``Q2`` and ``X2`` in
+   thread ``T2``. In other words, ``Q1`` and ``Q2`` represent the last
+   point of convergence, with no other header being executed before
+   executing ``X``.
+
+**Example:**
+
+.. figure:: convergence-both-diverged-nested.png
+   :name: convergence-both-diverged-nested
+
+The above figure shows two nested irreducible cycles with headers
+``R`` and ``S``. The nodes ``Entry`` and ``Q`` have divergent
+branches. The table below shows the convergence between three threads
+taking different paths through the CFG. Dynamic instances listed in
+the same column are converged.
+
+   .. table::
+      :align: left
+
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+      |         | 1     | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9    |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread1 | Entry | P1  | Q1  | S1  | P3  | Q3  | R1  | S2  | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread2 | Entry | P2  | Q2  |     |     |     | R2  | S3  | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread3 | Entry |     |     |     |     |     | R3  | S4  | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+------+
+
+- ``P2`` and ``P3`` are not converged due to ``S1``
+- ``Q2`` and ``Q3`` are not converged due to ``S1``
+- ``S1`` and ``S3`` are not converged due to ``R2``
+- ``S1`` and ``S4`` are not converged due to ``R3``
+
+Informally, ``T1`` and ``T2`` execute the inner cycle a different
+number of times, without executing the header of the outer cycle. All
+threads converge in the outer cycle when they first execute the header
+of the outer cycle.
+
+.. _convergence-uniformity:
+
+Uniformity
+==========
+
+1. The output of two converged dynamic instances is uniform if and
+   only if it compares equal for those two dynamic instances.
+2. The output of a static instance ``X`` is uniform *for a given set
+   of threads* if and only if it is uniform for every pair of
+   converged dynamic instances of ``X`` produced by those threads.
+
+A non-uniform value is said to be *divergent*.
+
+For a set ``S`` of threads, the uniformity of each output of a static
+instance is determined as follows:
+
+1. The semantics of the instruction may specify the output to be
+   uniform.
+2. Otherwise, if it is a PHI node, its output is uniform if and only
+   if for every pair of converged dynamic instances produced by all
+   threads in ``S``:
+
+   a. Both instances choose the same output from converged
+      dynamic instances, and,
+   b. That output is uniform for all threads in ``S``.
+3. Otherwise, the output is uniform if and only if the input
+   operands are uniform for all threads in ``S``.
+
+Divergent Cycle Exits
+---------------------
+
+When a divergent branch occurs inside a cycle, it is possible that a
+diverged path continues to an exit of the cycle. This is called a
+divergent cycle exit. If the cycle is irreducible, the diverged path
+may re-enter and eventually reach a join within the cycle. Such a join
+should be examined for the :ref:`diverged entry
+<convergence-diverged-entry>` criterion.
+
+Nodes along the diverged path that lie outside the cycle experience
+*temporal divergence*, when two threads executing convergently inside
+the cycle produce uniform values, but exit the cycle along the same
+divergent path after executing the header a different number of times
+(informally, on different iterations of the cycle). For a node ``N``
+inside the cycle, the outputs may be uniform for the two threads, but
+any use ``U`` outside the cycle receives a value from non-converged
+dynamic instances of ``N``. An output of ``U`` may be divergent,
+depending on the semantics of the instruction.
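+
+A thread-level simulation can make this concrete. In the sketch below, two
+threads compute the same value on converged iterations of a loop, but exit
+after a hypothetical, thread-dependent number of iterations; the use outside
+the loop then observes values coming from non-converged dynamic instances.
+The trip counts and names are made up, and this is an analogy in plain C++
+rather than IR.
+
+.. code-block:: c++
+
+   #include <iostream>
+   #include <vector>
+
+   int main() {
+     // The exit branch is divergent: each thread leaves the loop after a
+     // different (hypothetical) number of iterations.
+     std::vector<int> TripCount = {3, 5};
+     std::vector<int> ValueAtExit;
+     for (int Thread = 0; Thread < 2; ++Thread) {
+       int I = 0;                      // equal for converged iterations
+       while (I < TripCount[Thread])   // divergent exit condition
+         ++I;
+       ValueAtExit.push_back(I);       // use outside the cycle
+     }
+     std::cout << "value seen outside the cycle: " << ValueAtExit[0]
+               << " vs " << ValueAtExit[1] << "\n";   // 3 vs 5
+     return 0;
+   }
+
+Inside the loop the two threads agree on ``I`` whenever their iterations are
+converged, yet the value that escapes the cycle differs between them, which is
+exactly the temporal divergence described above.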
+
+Static Uniformity Analysis
+==========================
+
+Irreducible control flow results in different cycle hierarchies
+depending on the choice of headers during depth-first traversal. As a
+result, a static analysis cannot always determine the convergence of
+nodes in irreducible cycles, and any uniformity analysis is limited to
+those static instances whose convergence is independent of the cycle
+hierarchy:
+
+  **m-converged static instances:**
+
+  A static instance ``X`` is *m-converged* for a given CFG if and only
+  if the maximal converged-with relation for its dynamic instances is
+  the same in every cycle hierarchy that can be constructed for that CFG.
+
+  .. note::
+
+   In other words, two dynamic instances ``X1`` and ``X2`` of an
+   m-converged static instance ``X`` are converged in some cycle
+   hierarchy if and only if they are also converged in every other
+   cycle hierarchy for the same CFG.
+
+   As noted earlier, for brevity, we restrict the term *converged* to
+   mean "related under the maximal converged-with relation for a given
+   cycle hierarchy".
+
+
+Each node ``X`` in a given CFG is reported to be m-converged if and
+only if:
+
+1. ``X`` is a :ref:`top-level<cycle-toplevel-block>` node, in which
+   case, there are no cycle headers to influence the convergence of
+   ``X``.
+
+2. Otherwise, if ``X`` is inside a cycle, then every cycle that
+   contains ``X`` satisfies the following necessary conditions:
+
+   a. Every divergent branch inside the cycle satisfies the
+      :ref:`diverged entry criterion<convergence-diverged-entry>`, and,
+   b. There are no :ref:`diverged paths reaching the
+      cycle<convergence-diverged-outside>` from a divergent branch
+      outside it.
+
+.. note::
+
+   A reducible cycle :ref:`trivially satisfies
+   <convergence-reducible-cycle>` the above conditions. In particular,
+   if the whole CFG is reducible, then all nodes in the CFG are
+   m-converged.
+
+If a static instance is not m-converged, then every output is assumed
+to be divergent. Otherwise, for an m-converged static instance, the
+uniformity of each output is determined using the criteria
+:ref:`described earlier <convergence-uniformity>`. The discovery of
+divergent outputs may cause their uses (including branches) to also
+become divergent. The analysis propagates this divergence until a
+fixed point is reached.
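+
+The shape of this propagation can be illustrated with a small, self-contained
+sketch: a seed value is marked divergent, and divergence then flows along a
+toy def-use graph until nothing changes. The graph, the value names, and the
+edge from the divergent branch to the phi (standing in for the sync dependence
+at a join) are hypothetical; the sketch shows only the worklist iteration, not
+the actual template implementation.
+
+.. code-block:: c++
+
+   #include <deque>
+   #include <iostream>
+   #include <map>
+   #include <set>
+   #include <string>
+   #include <vector>
+
+   int main() {
+     // Toy def-use edges: each value lists the values that use it. The edge
+     // "br" -> "phi" models a phi node at a join of the divergent branch.
+     std::map<std::string, std::vector<std::string>> Uses = {
+         {"tid", {"cond"}}, {"cond", {"br"}},  {"br", {"phi"}},
+         {"phi", {"sum"}},  {"base", {"sum"}}, {"sum", {}}};
+
+     std::set<std::string> Divergent = {"tid"}; // seed: source of divergence
+     std::deque<std::string> Worklist = {"tid"};
+
+     // Propagate divergence from definitions to uses until a fixed point.
+     while (!Worklist.empty()) {
+       std::string V = Worklist.front();
+       Worklist.pop_front();
+       for (const std::string &U : Uses[V])
+         if (Divergent.insert(U).second)  // newly divergent: revisit its uses
+           Worklist.push_back(U);
+     }
+
+     for (const auto &Entry : Uses)
+       std::cout << Entry.first << ": "
+                 << (Divergent.count(Entry.first) ? "divergent" : "uniform")
+                 << "\n";
+     return 0;
+   }
+
+Only ``base`` stays uniform here; everything reachable from the seed through
+uses, including the phi tainted by the divergent branch, ends up divergent.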
+
+The convergence inferred using these criteria is a safe subset of the
+maximal converged-with relation for any cycle hierarchy. In
+particular, it is sufficient to determine if a static instance is
+m-converged for a given cycle hierarchy ``T``, even if that fact is
+not detected when examining some other cycle hierarchy ``T'``.
+
+This property allows compiler transforms to use the uniformity
+analysis without being affected by DFS choices made in the underlying
+cycle analysis. When two transforms use different instances of the
+uniformity analysis for the same CFG, a "divergent value" result in
+one analysis instance cannot contradict a "uniform value" result in
+the other.
+
+Generic transforms such as SimplifyCFG, CSE, and loop transforms
+commonly change the program in ways that change the maximal
+converged-with relations. This also means that a value that was
+previously uniform can become divergent after such a transform.
+Uniformity has to be recomputed after such transforms.
+
+Divergent Branch inside a Cycle
+-------------------------------
+
+.. figure:: convergence-divergent-inside.png
+   :name: convergence-divergent-inside
+
+The above figure shows a divergent branch ``Q`` inside an irreducible
+cyclic region. When two threads diverge at ``Q``, the convergence of
+dynamic instances within the cyclic region depends on the cycle
+hierarchy chosen:
+
+1. In an implementation that detects a single cycle ``C`` with header
+   ``P``, convergence inside the cycle is determined by ``P``.
+
+2. In an implementation that detects two nested cycles with headers
+   ``R`` and ``S``, convergence inside those cycles is determined by
+   their respective headers.
+
+.. _convergence-diverged-entry:
+
+A conservative approach would be to simply report all nodes inside
+irreducible cycles as having divergent outputs. But it is desirable to
+recognize m-converged nodes in the CFG in order to maximize
+uniformity. This section describes one such pattern of nodes derived
+from *closed paths*, which are a property of the CFG and do not depend
+on the cycle hierarchy.
+
+  **Diverged Entry Criterion:**
+
+  The dynamic instances of all the nodes in a closed path ``P`` are
+  m-converged only if for every divergent branch ``B`` and its
+  join node ``J`` that lie on ``P``, there is no entry to ``P`` which
+  lies on a diverged path from ``B`` to ``J``.
+
+.. figure:: convergence-closed-path.png
+   :name: convergence-closed-path
+
+Consider the closed path ``P -> Q -> R -> S`` in the above figure.
+``P`` and ``R`` are :ref:`entries to the closed
+path<cycle-closed-path>`. ``Q`` is a divergent branch and ``S`` is a
+join for that branch, with diverged paths ``Q -> R -> S`` and ``Q ->
+S``.
+
+- If a diverged entry ``R`` exists, then in some cycle hierarchy,
+  ``R`` is the header of the smallest cycle ``C`` containing the
+  closed path and a :ref:`child cycle<cycle-definition>` ``C'``
+  exists in the set ``C - R``, containing both branch ``Q`` and join
+  ``S``. When threads diverge at ``Q``, one subset ``M`` continues
+  inside cycle ``C'``, while the complement ``N`` exits ``C'`` and
+  reaches ``R``. Dynamic instances of ``S`` executed by threads in set
+  ``M`` are not converged with those executed in set ``N`` due to the
+  presence of ``R``. Informally, threads that diverge at ``Q``
+  reconverge in the same iteration of the outer cycle ``C``, but they
+  may have executed the inner cycle ``C'`` differently.
+
+  .. table::
+     :align: left
+
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+     |         | 1     | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   | 10  | 11   |
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+     | Thread1 | Entry | P1  | Q1  |     |     |     | R1  | S1  | P3  | ... | Exit |
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+     | Thread2 | Entry | P2  | Q2  | S2  | P4  | Q4  | R2  | S4  |     |     | Exit |
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+
+  In the table above, ``S2`` is not converged with ``S1`` due to ``R1``.
+
+|
+
+- If ``R`` does not exist, or if any node other than ``R`` is the
+  header of ``C``, then no such child cycle ``C'`` is detected.
+  Threads that diverge at ``Q`` execute converged dynamic instances of
+  ``S`` since they do not encounter the cycle header on any path from
+  ``Q`` to ``S``. Informally, threads that diverge at ``Q``
+  reconverge at ``S`` in the same iteration of ``C``.
+
+  .. table::
+     :align: left
+
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+------+
+     |         | 1     | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   | 10   |
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+------+
+     | Thread1 | Entry | P1  | Q1  | R1  | S1  | P3  | Q3  | R3  | S3  | Exit |
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+------+
+     | Thread2 | Entry | P2  | Q2  |     | S2  | P4  | Q4  | R2  | S4  | Exit |
+     +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+------+
+
+|
+
+  .. note::
+
+     In general, the cycle ``C`` in the above statements is not
+     expected to be the same cycle for different headers. Cycles and
+     their headers are tightly coupled; for different headers in the
+     same outermost cycle, the child cycles detected may be different.
+     The property relevant to the above examples is that for every
+     closed path, there is a cycle ``C`` that contains the path and
+     whose header is on that path.
+
+The diverged entry criterion must be checked for every closed path
+passing through a divergent branch ``B`` and its join ``J``. Since
+:ref:`every closed path passes through the header of some
+cycle<cycle-closed-path-header>`, this amounts to checking every cycle
+``C`` that contains ``B`` and ``J``. When the header of ``C``
+dominates the join ``J``, there can be no entry to any path from the
+header to ``J``, which includes any diverged path from ``B`` to ``J``.
+This is also true for any closed paths passing through the header of
+an outer cycle that contains ``C``.
+
+Thus, the diverged entry criterion can be conservatively simplified
+as follows:
+
+  For a divergent branch ``B`` and its join node ``J``, the nodes in a
+  cycle ``C`` that contains both ``B`` and ``J`` are m-converged only
+  if:
+
+  - ``B`` strictly dominates ``J``, or,
+  - The header ``H`` of ``C`` strictly dominates ``J``, or,
+  - Recursively, there is cycle ``C'`` inside ``C`` that satisfies the
+    same condition.
+
+When ``J`` is the same as ``H`` or ``B``, the trivial dominance is
+insufficient to make any statement about entries to diverged paths.
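+
+Transcribed directly into code, this conservative check walks the nested
+cycles that contain both the branch and the join and applies the dominance
+tests recursively. The sketch below uses toy stand-ins for blocks, cycles and
+the strict-dominance query (the real analysis would consult ``CycleInfo`` and
+the dominator tree); it is a restatement of the criterion under those
+assumptions, not the actual implementation.
+
+.. code-block:: c++
+
+   #include <functional>
+   #include <iostream>
+   #include <vector>
+
+   struct Block { const char *Name; };
+   struct Cycle {
+     const Block *Header = nullptr;
+     std::vector<const Cycle *> Children;             // nested cycles
+     std::function<bool(const Block *)> Contains;     // membership test
+   };
+   // Hypothetical strict-dominance query, assumed to be provided elsewhere.
+   using StrictDomFn = std::function<bool(const Block *, const Block *)>;
+
+   // Conservative diverged-entry check for a cycle C containing both the
+   // divergent branch B and its join J. Strict dominance already rules out
+   // the trivial cases where J is the same node as B or as the header.
+   bool satisfiesDivergedEntry(const Cycle *C, const Block *B, const Block *J,
+                               const StrictDomFn &StrictlyDominates) {
+     if (StrictlyDominates(B, J) || StrictlyDominates(C->Header, J))
+       return true;
+     for (const Cycle *Inner : C->Children)
+       if (Inner->Contains(B) && Inner->Contains(J) &&
+           satisfiesDivergedEntry(Inner, B, J, StrictlyDominates))
+         return true;
+     return false;
+   }
+
+   int main() {
+     Block H{"H"}, B{"B"}, J{"J"};
+     Cycle C;
+     C.Header = &H;
+     C.Contains = [&](const Block *X) { return X == &H || X == &B || X == &J; };
+     // Assume the header H strictly dominates every other block in C, while
+     // the branch B does not dominate the join J.
+     StrictDomFn SDom = [](const Block *A, const Block *X) {
+       return A && X && A->Name[0] == 'H' && X->Name[0] != 'H';
+     };
+     std::cout << "criterion satisfied: "
+               << satisfiesDivergedEntry(&C, &B, &J, SDom) << "\n";  // 1
+     return 0;
+   }
+
+When neither dominance test succeeds for any cycle in the chain, the analysis
+conservatively does not report the nodes of that cycle as m-converged.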
+
+.. _convergence-diverged-outside:
+
+Diverged Paths reaching a Cycle
+-------------------------------
+
+.. figure:: convergence-divergent-outside.png
+   :name: convergence-divergent-outside
+
+The figure shows two cycle hierarchies with a divergent branch in
+``Entry`` instead of ``Q``. For two threads that enter the closed path
+``P -> Q -> R -> S`` at ``P`` and ``R`` respectively, the convergence
+of dynamic instances generated along the path depends on whether ``P``
+or ``R`` is the header.
+
+-  Convergence when ``P`` is the header.
+
+   .. table::
+      :align: left
+
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+      |         | 1     | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   | 10  | 11  | 12  | 13   |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread1 | Entry |     |     |     | P1  | Q1  | R1  | S1  | P3  | Q3  |     | S3  | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread2 | Entry |     | R2  | S2  | P2  | Q2  |     | S2  | P4  | Q4  | R3  | S4  | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+
+   |
+
+-  Convergence when ``R`` is the header.
+
+   .. table::
+      :align: left
+
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+      |         | 1     | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   | 10  | 11  | 12   |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread1 | Entry |     | P1  | Q1  | R1  | S1  | P3  | Q3  | S3  |     |     | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+      | Thread2 | Entry |     |     |     | R2  | S2  | P2  | Q2  | S2  | P4  | ... | Exit |
+      +---------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
+
+   |
+
+Thus, when diverged paths reach different entries of an irreducible
+cycle from outside the cycle, the static analysis conservatively
+reports every node in the cycle as not m-converged.
+
+.. _convergence-reducible-cycle:
+
+Reducible Cycle
+---------------
+
+If ``C`` is a reducible cycle with header ``H``, then in any DFS,
+``H`` :ref:`must be the header of some cycle<cycle-reducible-headers>`
+``C'`` that contains ``C``. Independent of the DFS, there is no entry
+to the subgraph ``C`` other than ``H`` itself. Thus, we have the
+following:
+
+1. The diverged entry criterion is trivially satisfied for a divergent
+   branch and its join, where both are inside subgraph ``C``.
+2. When diverged paths reach the subgraph ``C`` from outside, their
+   convergence is always determined by the same header ``H``.
+
+Clearly, this can be determined only in a cycle hierarchy ``T`` where
+``C`` is detected as a reducible cycle. No such conclusion can be made
+in a different cycle hierarchy ``T'`` where ``C`` is part of a larger
+cycle ``C'`` with the same header, but this does not contradict the
+conclusion in ``T``.

diff --git a/llvm/docs/CycleTerminology.rst b/llvm/docs/CycleTerminology.rst
index 1c11107408d4d..6da180f301817 100644
--- a/llvm/docs/CycleTerminology.rst
+++ b/llvm/docs/CycleTerminology.rst
@@ -7,6 +7,8 @@ LLVM Cycle Terminology
 .. contents::
    :local:
 
+.. _cycle-definition:
+
 Cycles
 ======
 
@@ -53,6 +55,11 @@ A cycle C is said to be the *parent* of a basic block B if B occurs in
 C but not in any child cycle of C. Then B is also said to be a *child*
 of cycle C.
 
+.. _cycle-toplevel-block:
+
+A block B is said to be a *top-level block* if it is not the child of
+any cycle.
+
 .. _cycle-sibling:
 
 A basic block or cycle X is a *sibling* of another basic block or
@@ -194,6 +201,9 @@ A *closed path* in a CFG is a connected sequence of nodes and edges in
 the CFG whose start and end nodes are the same, and whose remaining
 (inner) nodes are distinct.
 
+An *entry* to a closed path ``P`` is a node on ``P`` that is reachable
+from the function entry without passing through any other node on ``P``.
+
 1. If a node D dominates one or more nodes in a closed path P and P
    does not contain D, then D dominates every node in P.
 
@@ -225,3 +235,31 @@ the CFG whose start and end nodes are the same, and whose remaining
    are the same cycle, or one of them is nested inside the other.
    Hence there is always a cycle that contains U1 and U2 but neither
    of D1 and D2.
+
+.. _cycle-closed-path-header:
+
+4. In any cycle hierarchy, the header ``H`` of the smallest cycle
+   ``C`` containing a closed path ``P`` itself lies on ``P``.
+
+   **Proof:** If ``H`` is not in ``P``, then there is a smaller cycle
+   ``C'`` in the set ``C - H`` containing ``P``, thus contradicting
+   the claim that ``C`` is the smallest such cycle.
+
+.. _cycle-reducible-headers:
+
+Reducible Cycle Headers
+=======================
+
+Although the cycle hierarchy depends on the DFS chosen, reducible
+cycles satisfy the following invariant:
+
+  If a reducible cycle ``C`` with header ``H`` is discovered in any
+  DFS, then there exists a cycle ``C'`` in every DFS with header
+  ``H``, that contains ``C``.
+
+**Proof:** For a closed path ``P`` in ``C`` that passes through ``H``,
+every cycle hierarchy has a smallest cycle ``C'`` containing ``P`` and
+whose header is in ``P``. Since ``H`` is the only entry to ``P``,
+``H`` must be the header of ``C'``. Since headers uniquely define
+cycles, ``C'`` contains every such closed path ``P``, and hence ``C'``
+contains ``C``.

diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst
index 9353eb757fdbc..15dabdc4a91c0 100644
--- a/llvm/docs/Reference.rst
+++ b/llvm/docs/Reference.rst
@@ -15,6 +15,7 @@ LLVM and API reference documentation.
    BranchWeightMetadata
    Bugpoint
    CommandGuide/index
+   ConvergenceAndUniformity
    Coroutines
    DependenceGraphs/index
    ExceptionHandling
@@ -219,3 +220,7 @@ Additional Topics
 
 :doc:`YamlIO`
    A reference guide for using LLVM's YAML I/O library.
+
+:doc:`ConvergenceAndUniformity`
+   A description of uniformity analysis in the presence of irreducible
+   control flow, and its implementation.

diff --git a/llvm/docs/convergence-both-diverged-nested.png b/llvm/docs/convergence-both-diverged-nested.png
new file mode 100755
index 0000000000000..8c9d4b53a40df
Binary files /dev/null and b/llvm/docs/convergence-both-diverged-nested.png differ

diff --git a/llvm/docs/convergence-closed-path.png b/llvm/docs/convergence-closed-path.png
new file mode 100755
index 0000000000000..96931ae240985
Binary files /dev/null and b/llvm/docs/convergence-closed-path.png differ

diff --git a/llvm/docs/convergence-divergent-inside.png b/llvm/docs/convergence-divergent-inside.png
new file mode 100755
index 0000000000000..5ec1eedf002b8
Binary files /dev/null and b/llvm/docs/convergence-divergent-inside.png differ

diff --git a/llvm/docs/convergence-divergent-outside.png b/llvm/docs/convergence-divergent-outside.png
new file mode 100755
index 0000000000000..45139bb578f5a
Binary files /dev/null and b/llvm/docs/convergence-divergent-outside.png differ

diff --git a/llvm/docs/convergence-natural-loop.png b/llvm/docs/convergence-natural-loop.png
new file mode 100755
index 0000000000000..8d39c1744f76d
Binary files /dev/null and b/llvm/docs/convergence-natural-loop.png differ

diff --git a/llvm/include/llvm/ADT/GenericCycleInfo.h b/llvm/include/llvm/ADT/GenericCycleInfo.h
index 1b987d8d3ac6b..63db9eb9a6013 100644
--- a/llvm/include/llvm/ADT/GenericCycleInfo.h
+++ b/llvm/include/llvm/ADT/GenericCycleInfo.h
@@ -105,7 +105,9 @@ template <typename ContextT> class GenericCycle {
   }
 
   /// \brief Return whether \p Block is an entry block of the cycle.
-  bool isEntry(BlockT *Block) const { return is_contained(Entries, Block); }
+  bool isEntry(const BlockT *Block) const {
+    return is_contained(Entries, Block);
+  }
 
   /// \brief Return whether \p Block is contained in the cycle.
   bool contains(const BlockT *Block) const {

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
new file mode 100644
index 0000000000000..2fa5058cf0c8d
--- /dev/null
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -0,0 +1,1391 @@
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This template implementation resides in a separate file so that it
+// does not get injected into every .cpp file that includes the
+// generic header.
+//
+// DO NOT INCLUDE THIS FILE WHEN MERELY USING UNIFORMITYINFO.
+//
+// This file should only be included by files that implement a
+// specialization of the relevant templates. Currently these are:
+// - UniformityAnalysis.cpp
+//
+// Note: The DEBUG_TYPE macro should be defined before using this
+// file so that any use of LLVM_DEBUG is associated with the
+// including file rather than this file.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Implementation of uniformity analysis.
+///
+/// The algorithm is a fixed point iteration that starts with the assumption
+/// that all control flow and all values are uniform. Starting from sources of
+/// divergence (whose discovery must be implemented by a CFG- or even
+/// target-specific derived class), divergence of values is propagated from
+/// definition to uses in a straight-forward way. The main complexity lies in
+/// the propagation of the impact of divergent control flow on the divergence of
+/// values (sync dependencies).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_GENERICUNIFORMITYIMPL_H
+#define LLVM_ADT_GENERICUNIFORMITYIMPL_H
+
+#include "llvm/ADT/GenericUniformityInfo.h"
+
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "uniformity"
+
+using namespace llvm;
+
+namespace llvm {
+
+template <typename Range> auto unique(Range &&R) {
+  return std::unique(adl_begin(R), adl_end(R));
+}
+
+/// Construct a specially modified post-order traversal of cycles.
+///
+/// The ModifiedPO is constructed using a virtually modified CFG as follows:
+///
+/// 1. The successors of pre-entry nodes (predecessors of a cycle
+///    entry that are outside the cycle) are replaced by the
+///    successors of the successors of the header.
+/// 2. Successors of the cycle header are replaced by the exit blocks
+///    of the cycle.
+///
+/// Effectively, we produce a depth-first numbering with the following
+/// properties:
+///
+/// 1. Nodes after a cycle are numbered earlier than the cycle header.
+/// 2. The header is numbered earlier than the nodes in the cycle.
+/// 3. The numbering of the nodes within the cycle forms an interval
+///    starting with the header.
+///
+/// Effectively, the virtual modification arranges the nodes in a
+/// cycle as a DAG with the header as the sole leaf, and successors of
+/// the header as the roots. A reverse traversal of this numbering has
+/// the following invariant on the unmodified original CFG:
+///
+///    Each node is visited after all its predecessors, except if that
+///    predecessor is the cycle header.
+///
+template <typename ContextT> class ModifiedPostOrder {
+public:
+  using BlockT = typename ContextT::BlockT;
+  using FunctionT = typename ContextT::FunctionT;
+  using DominatorTreeT = typename ContextT::DominatorTreeT;
+
+  using CycleInfoT = GenericCycleInfo<ContextT>;
+  using CycleT = typename CycleInfoT::CycleT;
+  using const_iterator = typename std::vector<BlockT *>::const_iterator;
+
+  ModifiedPostOrder(const ContextT &C) : Context(C) {}
+
+  bool empty() const { return m_order.empty(); }
+  size_t size() const { return m_order.size(); }
+
+  void clear() { m_order.clear(); }
+  void compute(const CycleInfoT &CI);
+
+  unsigned count(BlockT *BB) const { return POIndex.count(BB); }
+  const BlockT *operator[](size_t idx) const { return m_order[idx]; }
+
+  void appendBlock(const BlockT &BB, bool isReducibleCycleHeader = false) {
+    POIndex[&BB] = m_order.size();
+    m_order.push_back(&BB);
+    LLVM_DEBUG(dbgs() << "ModifiedPO(" << POIndex[&BB]
+                      << "): " << Context.print(&BB) << "\n");
+    if (isReducibleCycleHeader)
+      ReducibleCycleHeaders.insert(&BB);
+  }
+
+  unsigned getIndex(const BlockT *BB) const {
+    assert(POIndex.count(BB));
+    return POIndex.lookup(BB);
+  }
+
+  bool isReducibleCycleHeader(const BlockT *BB) const {
+    return ReducibleCycleHeaders.contains(BB);
+  }
+
+private:
+  SmallVector<const BlockT *> m_order;
+  DenseMap<const BlockT *, unsigned> POIndex;
+  SmallPtrSet<const BlockT *, 32> ReducibleCycleHeaders;
+  const ContextT &Context;
+
+  void computeCyclePO(const CycleInfoT &CI, const CycleT *Cycle,
+                      SmallPtrSetImpl<BlockT *> &Finalized);
+
+  void computeStackPO(SmallVectorImpl<BlockT *> &Stack, const CycleInfoT &CI,
+                      const CycleT *Cycle,
+                      SmallPtrSetImpl<BlockT *> &Finalized);
+};
+
+template <typename> class DivergencePropagator;
+
+/// \class GenericSyncDependenceAnalysis
+///
+/// \brief Locate join blocks for disjoint paths starting at a divergent branch.
+///
+/// An analysis per divergent branch that returns the set of basic
+/// blocks whose phi nodes become divergent due to divergent control.
+/// These are the blocks that are reachable by two disjoint paths from
+/// the branch, or cycle exits reachable along a path that is disjoint
+/// from a path to the cycle latch.
+
+//
+// Originally implemented in SyncDependenceAnalysis.cpp for DivergenceAnalysis.
+//
+// The SyncDependenceAnalysis is used in the UniformityAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Reference --
+// The algorithm is an extension of Section 5 of
+//
+//   An abstract interpretation for SPMD divergence
+//       on reducible control flow graphs.
+//   Julian Rosemann, Simon Moll and Sebastian Hack
+//   POPL '21
+//
+//
+// -- Sync dependence --
+// Sync dependence characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme.
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is paths from the entry without a definition do not result in
+// phi nodes.
+//
+//       entry
+//     /      \
+//    A        \
+//  /   \       Y
+// B     C     /
+//  \   /  \  /
+//    D     E
+//     \   /
+//       F
+//
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+//
+//           entry
+//         /      \
+//        A        \
+//      /   \       Y
+// x = 0   x = 1   /
+//      \  /   \  /
+//        D     E
+//         \   /
+//           F
+//
+// Our flavor of SSA construction for x will construct the following
+//
+//            entry
+//          /      \
+//         A        \
+//       /   \       Y
+// x0 = 0   x1 = 1  /
+//       \   /   \ /
+//     x2 = phi   E
+//         \     /
+//         x3 = phi
+//
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoint paths from A.
+//
+// -- Remarks --
+// * In case of cycle exits we need to check for temporal divergence.
+//   To this end, we check whether the definition of x differs between the
+//   cycle exit and the cycle header (_after_ SSA construction).
+//
+// * In the presence of irreducible control flow, the fixed point is
+//   reached only after multiple iterations. This is because labels
+//   reaching the header of a cycle must be repropagated through the
+//   cycle. This is true even in a reducible cycle, since the labels
+//   may have been produced by a nested irreducible cycle.
+//
+// * Note that SyncDependenceAnalysis is not concerned with the points
+//   of convergence in an irreducible cycle. Its only purpose is to
+//   identify join blocks. The "diverged entry" criterion is
+//   separately applied on join blocks to determine if an entire
+//   irreducible cycle is assumed to be divergent.
+//
+// * Relevant related work:
+//     A simple algorithm for global data flow analysis problems.
+//     Matthew S. Hecht and Jeffrey D. Ullman.
+//     SIAM Journal on Computing, 4(4):519–532, December 1975.
+//
+template <typename ContextT> class GenericSyncDependenceAnalysis {
+public:
+  using BlockT = typename ContextT::BlockT;
+  using DominatorTreeT = typename ContextT::DominatorTreeT;
+  using FunctionT = typename ContextT::FunctionT;
+  using ValueRefT = typename ContextT::ValueRefT;
+  using InstructionT = typename ContextT::InstructionT;
+
+  using CycleInfoT = GenericCycleInfo<ContextT>;
+  using CycleT = typename CycleInfoT::CycleT;
+
+  using ConstBlockSet = SmallPtrSet<const BlockT *, 4>;
+  using ModifiedPO = ModifiedPostOrder<ContextT>;
+
+  // * if BlockLabels[B] == C then C is the dominating definition at
+  //   block B
+  // * if BlockLabels[B] == nullptr then we haven't seen B yet
+  // * if BlockLabels[B] == B then:
+  //   - B is a join point of disjoint paths from X, or,
+  //   - B is an immediate successor of X (initial value), or,
+  //   - B is X
+  using BlockLabelMap = DenseMap<const BlockT *, const BlockT *>;
+
+  /// Information discovered by the sync dependence analysis for each
+  /// divergent branch.
+  struct DivergenceDescriptor {
+    // Join points of diverged paths.
+    ConstBlockSet JoinDivBlocks;
+    // Divergent cycle exits
+    ConstBlockSet CycleDivBlocks;
+    // Labels assigned to blocks on diverged paths.
+    BlockLabelMap BlockLabels;
+  };
+
+  using DivergencePropagatorT = DivergencePropagator<ContextT>;
+
+  GenericSyncDependenceAnalysis(const ContextT &Context,
+                                const DominatorTreeT &DT, const CycleInfoT &CI);
+
+  /// \brief Computes divergent join points and cycle exits caused by branch
+  /// divergence in \p Term.
+  ///
+  /// This returns a pair of sets:
+  /// * The set of blocks which are reachable by disjoint paths from
+  ///   \p Term.
+  /// * The set also contains cycle exits if there are two disjoint paths:
+  ///   one from \p Term to the cycle exit and another from \p Term to
+  ///   the cycle header.
+  const DivergenceDescriptor &getJoinBlocks(const BlockT *DivTermBlock);
+
+private:
+  static DivergenceDescriptor EmptyDivergenceDesc;
+
+  ModifiedPO CyclePO;
+
+  const DominatorTreeT &DT;
+  const CycleInfoT &CI;
+
+  DenseMap<const BlockT *, std::unique_ptr<DivergenceDescriptor>>
+      CachedControlDivDescs;
+};
+
+/// \brief Analysis that identifies uniform values in a data-parallel
+/// execution.
+///
+/// This analysis propagates divergence in a data-parallel context
+/// from sources of divergence to all users. It can be instantiated
+/// for an IR that provides a suitable SSAContext.
+template <typename ContextT> class GenericUniformityAnalysisImpl {
+public:
+  using BlockT = typename ContextT::BlockT;
+  using FunctionT = typename ContextT::FunctionT;
+  using ValueRefT = typename ContextT::ValueRefT;
+  using ConstValueRefT = typename ContextT::ConstValueRefT;
+  using InstructionT = typename ContextT::InstructionT;
+  using DominatorTreeT = typename ContextT::DominatorTreeT;
+
+  using CycleInfoT = GenericCycleInfo<ContextT>;
+  using CycleT = typename CycleInfoT::CycleT;
+
+  using SyncDependenceAnalysisT = GenericSyncDependenceAnalysis<ContextT>;
+  using DivergenceDescriptorT =
+      typename SyncDependenceAnalysisT::DivergenceDescriptor;
+  using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+
+  GenericUniformityAnalysisImpl(const FunctionT &F, const DominatorTreeT &DT,
+                                const CycleInfoT &CI,
+                                const TargetTransformInfo *TTI)
+      : Context(CI.getSSAContext()), F(F), CI(CI), TTI(TTI), DT(DT),
+        SDA(Context, DT, CI) {}
+
+  void initialize();
+
+  const FunctionT &getFunction() const { return F; }
+
+  /// \brief Mark \p Instr as an instruction that is always uniform.
+  void addUniformOverride(const InstructionT &Instr);
+
+  /// \brief Mark \p DivVal as a value that is always divergent.
+  /// \returns Whether the tracked divergence state of \p DivVal changed.
+  bool markDivergent(const InstructionT &I);
+  bool markDivergent(ConstValueRefT DivVal);
+  bool markDefsDivergent(const InstructionT &Instr,
+                         bool AllDefsDivergent = true);
+
+  /// \brief Propagate divergence to all instructions in the region.
+  /// Divergence is seeded by calls to \p markDivergent.
+  void compute();
+
+  /// \brief Whether any value was marked or analyzed to be divergent.
+  bool hasDivergence() const { return !DivergentValues.empty(); }
+
+  /// \brief Whether \p Instr will always produce a uniform value regardless
+  /// of its operands.
+  bool isAlwaysUniform(const InstructionT &Instr) const;
+
+  bool hasDivergentDefs(const InstructionT &I) const;
+
+  bool isDivergent(const InstructionT &I) const {
+    if (I.isTerminator()) {
+      return DivergentTermBlocks.contains(I.getParent());
+    }
+    return hasDivergentDefs(I);
+  };
+
+  /// \brief Whether \p V is divergent at its definition.
+  bool isDivergent(ConstValueRefT V) const { return DivergentValues.count(V); }
+
+  bool hasDivergentTerminator(const BlockT &B) const {
+    return DivergentTermBlocks.contains(&B);
+  }
+
+  void print(raw_ostream &out) const;
+
+protected:
+  /// \brief Value/block pair representing a single phi input.
+  struct PhiInput {
+    ConstValueRefT value;
+    BlockT *predBlock;
+
+    PhiInput(ConstValueRefT value, BlockT *predBlock)
+        : value(value), predBlock(predBlock) {}
+  };
+
+  const ContextT &Context;
+  const FunctionT &F;
+  const CycleInfoT &CI;
+  const TargetTransformInfo *TTI = nullptr;
+
+  // Detected/marked divergent values.
+  std::set<ConstValueRefT> DivergentValues;
+  SmallPtrSet<const BlockT *, 32> DivergentTermBlocks;
+
+  // Internal worklist for divergence propagation.
+  std::vector<const InstructionT *> Worklist;
+
+  /// \brief Mark \p Term as divergent and push all Instructions that become
+  /// divergent as a result on the worklist.
+  void analyzeControlDivergence(const InstructionT &Term);
+
+private:
+  const DominatorTreeT &DT;
+
+  // Recognized cycles with divergent exits.
+  SmallPtrSet<const CycleT *, 16> DivergentExitCycles;
+
+  // Cycles assumed to be divergent.
+  //
+  // We don't use a set here because every insertion needs an explicit
+  // traversal of all existing members.
+  SmallVector<const CycleT *> AssumedDivergent;
+
+  // The SDA links divergent branches to divergent control-flow joins.
+  SyncDependenceAnalysisT SDA;
+
+  // Set of known-uniform values.
+  SmallPtrSet<const InstructionT *, 32> UniformOverrides;
+
+  /// \brief Mark all definitions in \p JoinBlock as divergent and push them
+  /// on the worklist.
+  void taintAndPushAllDefs(const BlockT &JoinBlock);
+
+  /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on
+  /// the worklist.
+  void taintAndPushPhiNodes(const BlockT &JoinBlock);
+
+  /// \brief Identify all Instructions that become divergent because \p DivExit
+  /// is a divergent cycle exit of \p DivCycle. Mark those instructions as
+  /// divergent and push them on the worklist.
+  void propagateCycleExitDivergence(const BlockT &DivExit,
+                                    const CycleT &DivCycle);
+
+  /// \brief Internal implementation function for propagateCycleExitDivergence.
+  void analyzeCycleExitDivergence(const CycleT &OuterDivCycle);
+
+  /// \brief Mark \p I as divergent if it uses a value defined in
+  /// \p OuterDivCycle, and push it on the worklist.
+  void analyzeTemporalDivergence(const InstructionT &I,
+                                 const CycleT &OuterDivCycle);
+
+  /// \brief Push all users of \p Val (in the region) to the worklist.
+  void pushUsers(const InstructionT &I);
+  void pushUsers(ConstValueRefT V);
+
+  bool usesValueFromCycle(const InstructionT &I, const CycleT &DefCycle) const;
+
+  /// \brief Whether \p Val is divergent when read in \p ObservingBlock.
+  bool isTemporalDivergent(const BlockT &ObservingBlock,
+                           ConstValueRefT Val) const;
+};
+
+template <typename ContextT>
+void GenericUniformityInfo<ContextT>::ImplDeleter::operator()(
+    GenericUniformityAnalysisImpl<ContextT> *Impl) {
+  delete Impl;
+}
+
+/// Compute divergence starting with a divergent branch.
+template <typename ContextT> class DivergencePropagator {
+public:
+  using BlockT = typename ContextT::BlockT;
+  using DominatorTreeT = typename ContextT::DominatorTreeT;
+  using FunctionT = typename ContextT::FunctionT;
+  using ValueRefT = typename ContextT::ValueRefT;
+
+  using CycleInfoT = GenericCycleInfo<ContextT>;
+  using CycleT = typename CycleInfoT::CycleT;
+
+  using ModifiedPO = ModifiedPostOrder<ContextT>;
+  using SyncDependenceAnalysisT = GenericSyncDependenceAnalysis<ContextT>;
+  using DivergenceDescriptorT =
+      typename SyncDependenceAnalysisT::DivergenceDescriptor;
+  using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+
+  const ModifiedPO &CyclePOT;
+  const DominatorTreeT &DT;
+  const CycleInfoT &CI;
+  const BlockT &DivTermBlock;
+  const ContextT &Context;
+
+  // Track blocks that receive a new label. Every time we relabel a
+  // cycle header, we make another pass over the modified post-order in
+  // order to propagate the header label. The bit vector also allows
+  // us to skip labels that have not changed.
+  SparseBitVector<> FreshLabels;
+
+  // divergent join and cycle exit descriptor.
+  std::unique_ptr<DivergenceDescriptorT> DivDesc;
+  BlockLabelMapT &BlockLabels;
+
+  DivergencePropagator(const ModifiedPO &CyclePOT, const DominatorTreeT &DT,
+                       const CycleInfoT &CI, const BlockT &DivTermBlock)
+      : CyclePOT(CyclePOT), DT(DT), CI(CI), DivTermBlock(DivTermBlock),
+        Context(CI.getSSAContext()), DivDesc(new DivergenceDescriptorT),
+        BlockLabels(DivDesc->BlockLabels) {}
+
+  void printDefs(raw_ostream &Out) {
+    Out << "Propagator::BlockLabels {\n";
+    for (int BlockIdx = (int)CyclePOT.size() - 1; BlockIdx >= 0; --BlockIdx) {
+      const auto *Block = CyclePOT[BlockIdx];
+      const auto *Label = BlockLabels[Block];
+      Out << Context.print(Block) << "(" << BlockIdx << ") : ";
+      if (!Label) {
+        Out << "<null>\n";
+      } else {
+        Out << Context.print(Label) << "\n";
+      }
+    }
+    Out << "}\n";
+  }
+
+  // Push a definition (\p PushedLabel) to \p SuccBlock and return whether this
+  // causes a divergent join.
+  bool computeJoin(const BlockT &SuccBlock, const BlockT &PushedLabel) {
+    const auto *OldLabel = BlockLabels[&SuccBlock];
+
+    LLVM_DEBUG(dbgs() << "labeling " << Context.print(&SuccBlock) << ":\n"
+                      << "\tpushed label: " << Context.print(&PushedLabel)
+                      << "\n"
+                      << "\told label: " << Context.print(OldLabel) << "\n");
+
+    // Early exit if there is no change in the label.
+    if (OldLabel == &PushedLabel)
+      return false;
+
+    if (OldLabel != &SuccBlock) {
+      auto SuccIdx = CyclePOT.getIndex(&SuccBlock);
+      // Assigning a new label, mark this in FreshLabels.
+      LLVM_DEBUG(dbgs() << "\tfresh label: " << SuccIdx << "\n");
+      FreshLabels.set(SuccIdx);
+    }
+
+    // This is not a join if the succ was previously unlabeled.
+    if (!OldLabel) {
+      LLVM_DEBUG(dbgs() << "\tnew label: " << Context.print(&PushedLabel)
+                        << "\n");
+      BlockLabels[&SuccBlock] = &PushedLabel;
+      return false;
+    }
+
+    // This is a new join. Label the join block as itself, and not as
+    // the pushed label.
+    LLVM_DEBUG(dbgs() << "\tnew label: " << Context.print(&SuccBlock) << "\n");
+    BlockLabels[&SuccBlock] = &SuccBlock;
+
+    return true;
+  }
+
+  // Visiting a virtual cycle exit edge from the cycle header: a join here
+  // indicates temporal divergence.
+  bool visitCycleExitEdge(const BlockT &ExitBlock, const BlockT &Label) {
+    if (!computeJoin(ExitBlock, Label))
+      return false;
+
+    // Identified a divergent cycle exit
+    DivDesc->CycleDivBlocks.insert(&ExitBlock);
+    LLVM_DEBUG(dbgs() << "\tDivergent cycle exit: " << Context.print(&ExitBlock)
+                      << "\n");
+    return true;
+  }
+
+  // process \p SuccBlock with reaching definition \p Label
+  bool visitEdge(const BlockT &SuccBlock, const BlockT &Label) {
+    if (!computeJoin(SuccBlock, Label))
+      return false;
+
+    // Divergent, disjoint paths join.
+    DivDesc->JoinDivBlocks.insert(&SuccBlock);
+    LLVM_DEBUG(dbgs() << "\tDivergent join: " << Context.print(&SuccBlock)
+                      << "\n");
+    return true;
+  }
+
+  std::unique_ptr<DivergenceDescriptorT> computeJoinPoints() {
+    assert(DivDesc);
+
+    LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
+                      << Context.print(&DivTermBlock) << "\n");
+
+    // Early stopping criterion
+    int FloorIdx = CyclePOT.size() - 1;
+    const BlockT *FloorLabel = nullptr;
+    int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
+
+    // Bootstrap with branch targets
+    auto const *DivTermCycle = CI.getCycle(&DivTermBlock);
+    for (const auto *SuccBlock : successors(&DivTermBlock)) {
+      if (DivTermCycle && !DivTermCycle->contains(SuccBlock)) {
+        // If DivTerm exits the cycle immediately, computeJoin() might
+        // not reach SuccBlock with a different label. We need to
+        // check for this exit now.
+        DivDesc->CycleDivBlocks.insert(SuccBlock);
+        LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
+                          << Context.print(SuccBlock) << "\n");
+      }
+      auto SuccIdx = CyclePOT.getIndex(SuccBlock);
+      visitEdge(*SuccBlock, *SuccBlock);
+      FloorIdx = std::min<int>(FloorIdx, SuccIdx);
+    }
+
+    while (true) {
+      auto BlockIdx = FreshLabels.find_last();
+      if (BlockIdx == -1 || BlockIdx < FloorIdx)
+        break;
+
+      LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
+
+      FreshLabels.reset(BlockIdx);
+      if (BlockIdx == DivTermIdx) {
+        LLVM_DEBUG(dbgs() << "Skipping DivTermBlock\n");
+        continue;
+      }
+
+      const auto *Block = CyclePOT[BlockIdx];
+      LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
+                        << BlockIdx << "\n");
+
+      const auto *Label = BlockLabels[Block];
+      assert(Label);
+
+      bool CausedJoin = false;
+      int LoweredFloorIdx = FloorIdx;
+
+      // If the current block is the header of a reducible cycle that
+      // contains the divergent branch, then the label should be
+      // propagated to the cycle exits. Such a header is the "last
+      // possible join" of any disjoint paths within this cycle. This
+      // prevents detection of spurious joins at the entries of any
+      // irreducible child cycles.
+      //
+      // This conclusion about the header is true for any choice of DFS:
+      //
+      //   If some DFS has a reducible cycle C with header H, then for
+      //   any other DFS, H is the header of a cycle C' that is a
+      //   superset of C. For a divergent branch inside the subgraph
+      //   C, any join node inside C is either H, or some node
+      //   encountered without passing through H.
+      //
+      auto getReducibleParent = [&](const BlockT *Block) -> const CycleT * {
+        if (!CyclePOT.isReducibleCycleHeader(Block))
+          return nullptr;
+        const auto *BlockCycle = CI.getCycle(Block);
+        if (BlockCycle->contains(&DivTermBlock))
+          return BlockCycle;
+        return nullptr;
+      };
+
+      if (const auto *BlockCycle = getReducibleParent(Block)) {
+        SmallVector<BlockT *, 4> BlockCycleExits;
+        BlockCycle->getExitBlocks(BlockCycleExits);
+        for (auto *BlockCycleExit : BlockCycleExits) {
+          CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
+          LoweredFloorIdx =
+              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
+        }
+      } else {
+        for (const auto *SuccBlock : successors(Block)) {
+          CausedJoin |= visitEdge(*SuccBlock, *Label);
+          LoweredFloorIdx =
+              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
+        }
+      }
+
+      // Floor update
+      if (CausedJoin) {
+        // 1. Different labels pushed to successors
+        FloorIdx = LoweredFloorIdx;
+      } else if (FloorLabel != Label) {
+        // 2. No join caused BUT we pushed a label that is different than the
+        // last pushed label
+        FloorIdx = LoweredFloorIdx;
+        FloorLabel = Label;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Final labeling:\n"; printDefs(dbgs()));
+
+    // Check every cycle containing DivTermBlock for exit divergence.
+    // A cycle has exit divergence if the label of an exit block does
+    // not match the label of its header.
+    for (const auto *Cycle = CI.getCycle(&DivTermBlock); Cycle;
+         Cycle = Cycle->getParentCycle()) {
+      if (Cycle->isReducible()) {
+        // The exit divergence of a reducible cycle is recorded while
+        // propagating labels.
+        continue;
+      }
+      SmallVector<BlockT *> Exits;
+      Cycle->getExitBlocks(Exits);
+      auto *Header = Cycle->getHeader();
+      auto *HeaderLabel = BlockLabels[Header];
+      for (const auto *Exit : Exits) {
+        if (BlockLabels[Exit] != HeaderLabel) {
+          // Identified a divergent cycle exit
+          DivDesc->CycleDivBlocks.insert(Exit);
+          LLVM_DEBUG(dbgs() << "\tDivergent cycle exit: " << Context.print(Exit)
+                            << "\n");
+        }
+      }
+    }
+
+    return std::move(DivDesc);
+  }
+};
+
+template <typename ContextT>
+typename llvm::GenericSyncDependenceAnalysis<ContextT>::DivergenceDescriptor
+    llvm::GenericSyncDependenceAnalysis<ContextT>::EmptyDivergenceDesc;
+
+template <typename ContextT>
+llvm::GenericSyncDependenceAnalysis<ContextT>::GenericSyncDependenceAnalysis(
+    const ContextT &Context, const DominatorTreeT &DT, const CycleInfoT &CI)
+    : CyclePO(Context), DT(DT), CI(CI) {
+  CyclePO.compute(CI);
+}
+
+template <typename ContextT>
+auto llvm::GenericSyncDependenceAnalysis<ContextT>::getJoinBlocks(
+    const BlockT *DivTermBlock) -> const DivergenceDescriptor & {
+  // trivial case
+  if (succ_size(DivTermBlock) <= 1) {
+    return EmptyDivergenceDesc;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedControlDivDescs.find(DivTermBlock);
+  if (ItCached != CachedControlDivDescs.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagatorT Propagator(CyclePO, DT, CI, *DivTermBlock);
+  auto DivDesc = Propagator.computeJoinPoints();
+
+  auto printBlockSet = [&](ConstBlockSet &Blocks) {
+    return Printable([&](raw_ostream &Out) {
+      Out << "[";
+      ListSeparator LS;
+      for (const auto *BB : Blocks) {
+        Out << LS << CI.getSSAContext().print(BB);
+      }
+      Out << "]\n";
+    });
+  };
+
+  LLVM_DEBUG(
+      dbgs() << "\nResult (" << CI.getSSAContext().print(DivTermBlock)
+             << "):\n  JoinDivBlocks: " << printBlockSet(DivDesc->JoinDivBlocks)
+             << "  CycleDivBlocks: " << printBlockSet(DivDesc->CycleDivBlocks)
+             << "\n");
+
+  auto ItInserted =
+      CachedControlDivDescs.try_emplace(DivTermBlock, std::move(DivDesc));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+template <typename ContextT>
+bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
+    const InstructionT &I) {
+  if (I.isTerminator()) {
+    if (DivergentTermBlocks.insert(I.getParent()).second) {
+      LLVM_DEBUG(dbgs() << "marked divergent term block: "
+                        << Context.print(I.getParent()) << "\n");
+      return true;
+    }
+    return false;
+  }
+
+  return markDefsDivergent(I);
+}
+
+template <typename ContextT>
+bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
+    ConstValueRefT Val) {
+  if (DivergentValues.insert(Val).second) {
+    LLVM_DEBUG(dbgs() << "marked divergent: " << Context.print(Val) << "\n");
+    return true;
+  }
+  return false;
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
+    const InstructionT &Instr) {
+  UniformOverrides.insert(&Instr);
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::analyzeTemporalDivergence(
+    const InstructionT &I, const CycleT &OuterDivCycle) {
+  if (isDivergent(I))
+    return;
+
+  LLVM_DEBUG(dbgs() << "Analyze temporal divergence: " << Context.print(&I)
+                    << "\n");
+  if (!usesValueFromCycle(I, OuterDivCycle))
+    return;
+
+  if (isAlwaysUniform(I))
+    return;
+
+  if (markDivergent(I))
+    Worklist.push_back(&I);
+}
+
+// Mark all external users of values defined inside \param
+// OuterDivCycle as divergent.
+//
+// This follows all live out edges wherever they may lead. Potential
+// users of values defined inside DivCycle could be anywhere in the
+// dominance region of DivCycle (including its fringes for phi nodes).
+// A cycle C dominates a block B iff every path from the entry block
+// to B must pass through a block contained in C. If C is a reducible
+// cycle (or natural loop), C dominates B iff the header of C
+// dominates B. But in general, we iteratively examine cycle
+// exits and their successors.
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::analyzeCycleExitDivergence(
+    const CycleT &OuterDivCycle) {
+  // Set of blocks that are dominated by the cycle, i.e., each is only
+  // reachable from paths that pass through the cycle.
+  SmallPtrSet<BlockT *, 16> DomRegion;
+
+  // The boundary of DomRegion, formed by blocks that are not
+  // dominated by the cycle.
+  SmallVector<BlockT *> DomFrontier;
+  OuterDivCycle.getExitBlocks(DomFrontier);
+
+  // Returns true if BB is dominated by the cycle.
+  auto isInDomRegion = [&](BlockT *BB) {
+    for (auto *P : predecessors(BB)) {
+      if (OuterDivCycle.contains(P))
+        continue;
+      if (DomRegion.count(P))
+        continue;
+      return false;
+    }
+    return true;
+  };
+
+  // Keep advancing the frontier along successor edges, while
+  // promoting blocks to DomRegion.
+  while (true) {
+    bool Promoted = false;
+    SmallVector<BlockT *> Temp;
+    for (auto *W : DomFrontier) {
+      if (!isInDomRegion(W)) {
+        Temp.push_back(W);
+        continue;
+      }
+      DomRegion.insert(W);
+      Promoted = true;
+      for (auto *Succ : successors(W)) {
+        if (DomRegion.contains(Succ))
+          continue;
+        Temp.push_back(Succ);
+      }
+    }
+    if (!Promoted)
+      break;
+    DomFrontier = Temp;
+  }
+
+  // At DomFrontier, only the PHI nodes are affected by temporal
+  // divergence.
+  for (const auto *UserBlock : DomFrontier) {
+    LLVM_DEBUG(dbgs() << "Analyze phis after cycle exit: "
+                      << Context.print(UserBlock) << "\n");
+    for (const auto &Phi : UserBlock->phis()) {
+      LLVM_DEBUG(dbgs() << "  " << Context.print(&Phi) << "\n");
+      analyzeTemporalDivergence(Phi, OuterDivCycle);
+    }
+  }
+
+  // All instructions inside the dominance region are affected by
+  // temporal divergence.
+  for (const auto *UserBlock : DomRegion) {
+    LLVM_DEBUG(dbgs() << "Analyze non-phi users after cycle exit: "
+                      << Context.print(UserBlock) << "\n");
+    for (const auto &I : *UserBlock) {
+      LLVM_DEBUG(dbgs() << "  " << Context.print(&I) << "\n");
+      analyzeTemporalDivergence(I, OuterDivCycle);
+    }
+  }
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::propagateCycleExitDivergence(
+    const BlockT &DivExit, const CycleT &InnerDivCycle) {
+  LLVM_DEBUG(dbgs() << "\tpropCycleExitDiv " << Context.print(&DivExit)
+                    << "\n");
+  auto *DivCycle = &InnerDivCycle;
+  auto *OuterDivCycle = DivCycle;
+  auto *ExitLevelCycle = CI.getCycle(&DivExit);
+  const unsigned CycleExitDepth =
+      ExitLevelCycle ? ExitLevelCycle->getDepth() : 0;
+
+  // Find outer-most cycle that does not contain \p DivExit
+  while (DivCycle && DivCycle->getDepth() > CycleExitDepth) {
+    LLVM_DEBUG(dbgs() << "  Found exiting cycle: "
+                      << Context.print(DivCycle->getHeader()) << "\n");
+    OuterDivCycle = DivCycle;
+    DivCycle = DivCycle->getParentCycle();
+  }
+  LLVM_DEBUG(dbgs() << "\tOuter-most exiting cycle: "
+                    << Context.print(OuterDivCycle->getHeader()) << "\n");
+
+  if (!DivergentExitCycles.insert(OuterDivCycle).second)
+    return;
+
+  // Exit divergence does not matter if the cycle itself is assumed to
+  // be divergent.
+  for (const auto *C : AssumedDivergent) {
+    if (C->contains(OuterDivCycle))
+      return;
+  }
+
+  analyzeCycleExitDivergence(*OuterDivCycle);
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
+    const BlockT &BB) {
+  LLVM_DEBUG(dbgs() << "taintAndPushAllDefs " << Context.print(&BB) << "\n");
+  for (const auto &I : instrs(BB)) {
+    // Terminators do not produce values; they are divergent only if
+    // the condition is divergent. That is handled when the divergent
+    // condition is placed in the worklist.
+    if (I.isTerminator())
+      break;
+
+    // Mark this as divergent. We don't check if the instruction is
+    // always uniform. In a cycle where the thread convergence is not
+    // statically known, the instruction is not statically converged,
+    // and its outputs cannot be statically uniform.
+    if (markDivergent(I))
+      Worklist.push_back(&I);
+  }
+}
+
+/// Mark divergent phi nodes in a join block
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
+    const BlockT &JoinBlock) {
+  LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
+                    << "\n");
+  for (const auto &Phi : JoinBlock.phis()) {
+    if (ContextT::isConstantValuePhi(Phi))
+      continue;
+    if (markDivergent(Phi))
+      Worklist.push_back(&Phi);
+  }
+}
+
+/// Add \p Candidate to \p Cycles if it is not already contained in \p Cycles.
+///
+/// \return true iff \p Candidate was added to \p Cycles.
+template <typename CycleT>
+static bool insertIfNotContained(SmallVector<CycleT *> &Cycles,
+                                 CycleT *Candidate) {
+  if (llvm::any_of(Cycles,
+                   [Candidate](CycleT *C) { return C->contains(Candidate); }))
+    return false;
+  Cycles.push_back(Candidate);
+  return true;
+}
+
+/// Return the outermost cycle made divergent by a branch outside it.
+///
+/// If two paths that diverged outside an irreducible cycle join
+/// inside that cycle, then that whole cycle is assumed to be
+/// divergent. This does not apply if the cycle is reducible.
+template <typename CycleT, typename BlockT>
+static const CycleT *getExtDivCycle(const CycleT *Cycle,
+                                    const BlockT *DivTermBlock,
+                                    const BlockT *JoinBlock) {
+  assert(Cycle);
+  assert(Cycle->contains(JoinBlock));
+
+  if (Cycle->contains(DivTermBlock))
+    return nullptr;
+
+  if (Cycle->isReducible()) {
+    assert(Cycle->getHeader() == JoinBlock);
+    return nullptr;
+  }
+
+  const auto *Parent = Cycle->getParentCycle();
+  while (Parent && !Parent->contains(DivTermBlock)) {
+    // If the join is inside a child, then the parent must be
+    // irreducible. The only join in a reducible cycle is its own
+    // header.
+    assert(!Parent->isReducible());
+    Cycle = Parent;
+    Parent = Cycle->getParentCycle();
+  }
+
+  LLVM_DEBUG(dbgs() << "cycle made divergent by external branch\n");
+  return Cycle;
+}
+
+/// Return the outermost cycle made divergent by a branch inside it.
+///
+/// This checks the "diverged entry" criterion defined in the
+/// docs/ConvergenceAnalysis.html.
+template <typename ContextT, typename CycleT, typename BlockT,
+          typename DominatorTreeT>
+static const CycleT *
+getIntDivCycle(const CycleT *Cycle, const BlockT *DivTermBlock,
+               const BlockT *JoinBlock, const DominatorTreeT &DT,
+               ContextT &Context) {
+  LLVM_DEBUG(dbgs() << "examine join " << Context.print(JoinBlock)
+                    << "for internal branch " << Context.print(DivTermBlock)
+                    << "\n");
+  if (DT.properlyDominates(DivTermBlock, JoinBlock))
+    return nullptr;
+
+  // Find the smallest common cycle, if one exists.
+  assert(Cycle && Cycle->contains(JoinBlock));
+  while (Cycle && !Cycle->contains(DivTermBlock)) {
+    Cycle = Cycle->getParentCycle();
+  }
+  if (!Cycle || Cycle->isReducible())
+    return nullptr;
+
+  if (DT.properlyDominates(Cycle->getHeader(), JoinBlock))
+    return nullptr;
+
+  LLVM_DEBUG(dbgs() << "  header " << Context.print(Cycle->getHeader())
+                    << " does not dominate join\n");
+
+  const auto *Parent = Cycle->getParentCycle();
+  while (Parent && !DT.properlyDominates(Parent->getHeader(), JoinBlock)) {
+    LLVM_DEBUG(dbgs() << "  header " << Context.print(Parent->getHeader())
+                      << " does not dominate join\n");
+    Cycle = Parent;
+    Parent = Parent->getParentCycle();
+  }
+
+  LLVM_DEBUG(dbgs() << "  cycle made divergent by internal branch\n");
+  return Cycle;
+}
+
+template <typename ContextT, typename CycleT, typename BlockT,
+          typename DominatorTreeT>
+static const CycleT *
+getOutermostDivergentCycle(const CycleT *Cycle, const BlockT *DivTermBlock,
+                           const BlockT *JoinBlock, const DominatorTreeT &DT,
+                           ContextT &Context) {
+  if (!Cycle)
+    return nullptr;
+
+  // First try to expand Cycle to the largest that contains JoinBlock
+  // but not DivTermBlock.
+  const auto *Ext = getExtDivCycle(Cycle, DivTermBlock, JoinBlock);
+
+  // Continue expanding to the largest cycle that contains both.
+  const auto *Int = getIntDivCycle(Cycle, DivTermBlock, JoinBlock, DT, Context);
+
+  if (Int)
+    return Int;
+  return Ext;
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::analyzeControlDivergence(
+    const InstructionT &Term) {
+  const auto *DivTermBlock = Term.getParent();
+  DivergentTermBlocks.insert(DivTermBlock);
+  LLVM_DEBUG(dbgs() << "analyzeControlDiv " << Context.print(DivTermBlock)
+                    << "\n");
+
+  // Don't propagate divergence from unreachable blocks.
+  if (!DT.isReachableFromEntry(DivTermBlock))
+    return;
+
+  const auto &DivDesc = SDA.getJoinBlocks(DivTermBlock);
+  SmallVector<const CycleT *> DivCycles;
+
+  // Iterate over all join blocks now reachable via disjoint paths
+  for (const auto *JoinBlock : DivDesc.JoinDivBlocks) {
+    const auto *Cycle = CI.getCycle(JoinBlock);
+    LLVM_DEBUG(dbgs() << "visiting join block " << Context.print(JoinBlock)
+                      << "\n");
+    if (const auto *Outermost = getOutermostDivergentCycle(
+            Cycle, DivTermBlock, JoinBlock, DT, Context)) {
+      LLVM_DEBUG(dbgs() << "found divergent cycle\n");
+      DivCycles.push_back(Outermost);
+      continue;
+    }
+    taintAndPushPhiNodes(*JoinBlock);
+  }
+
+  // Sort by order of decreasing depth. This allows later cycles to be skipped
+  // because they are already contained in earlier ones.
+  llvm::sort(DivCycles, [](const CycleT *A, const CycleT *B) {
+    return A->getDepth() > B->getDepth();
+  });
+
+  // Cycles that are assumed divergent due to the diverged entry
+  // criterion potentially contain temporal divergence depending on
+  // the DFS chosen. Conservatively, all values produced in such a
+  // cycle are assumed divergent. "Cycle invariant" values may be
+  // assumed uniform, but that requires further analysis.
+  for (auto *C : DivCycles) {
+    if (!insertIfNotContained(AssumedDivergent, C))
+      continue;
+    LLVM_DEBUG(dbgs() << "process divergent cycle\n");
+    for (const BlockT *BB : C->blocks()) {
+      taintAndPushAllDefs(*BB);
+    }
+  }
+
+  const auto *BranchCycle = CI.getCycle(DivTermBlock);
+  assert(DivDesc.CycleDivBlocks.empty() || BranchCycle);
+  for (const auto *DivExitBlock : DivDesc.CycleDivBlocks) {
+    propagateCycleExitDivergence(*DivExitBlock, *BranchCycle);
+  }
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::compute() {
+  // Initialize worklist.
+  auto DivValuesCopy = DivergentValues;
+  for (const auto DivVal : DivValuesCopy) {
+    assert(isDivergent(DivVal) && "Worklist invariant violated!");
+    pushUsers(DivVal);
+  }
+
+  // All values on the Worklist are divergent.
+  // Their users may not have been updated yet.
+  while (!Worklist.empty()) {
+    const InstructionT *I = Worklist.back();
+    Worklist.pop_back();
+
+    LLVM_DEBUG(dbgs() << "worklist pop: " << Context.print(I) << "\n");
+
+    if (I->isTerminator()) {
+      analyzeControlDivergence(*I);
+      continue;
+    }
+
+    // propagate value divergence to users
+    assert(isDivergent(*I) && "Worklist invariant violated!");
+    pushUsers(*I);
+  }
+}
+
+template <typename ContextT>
+bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
+    const InstructionT &Instr) const {
+  return UniformOverrides.contains(&Instr);
+}
+
+template <typename ContextT>
+GenericUniformityInfo<ContextT>::GenericUniformityInfo(
+    FunctionT &Func, const DominatorTreeT &DT, const CycleInfoT &CI,
+    const TargetTransformInfo *TTI)
+    : F(&Func) {
+  DA.reset(new ImplT{Func, DT, CI, TTI});
+  DA->initialize();
+  DA->compute();
+}
+
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
+  bool haveDivergentArgs = false;
+
+  if (DivergentValues.empty()) {
+    assert(DivergentTermBlocks.empty());
+    assert(DivergentExitCycles.empty());
+    OS << "ALL VALUES UNIFORM\n";
+    return;
+  }
+
+  for (const auto &entry : DivergentValues) {
+    const BlockT *parent = Context.getDefBlock(entry);
+    if (!parent) {
+      if (!haveDivergentArgs) {
+        OS << "DIVERGENT ARGUMENTS:\n";
+        haveDivergentArgs = true;
+      }
+      OS << "  DIVERGENT: " << Context.print(entry) << '\n';
+    }
+  }
+
+  if (!AssumedDivergent.empty()) {
+    OS << "CYCLES ASSSUMED DIVERGENT:\n";
+    for (const CycleT *cycle : AssumedDivergent) {
+      OS << "  " << cycle->print(Context) << '\n';
+    }
+  }
+
+  if (!DivergentExitCycles.empty()) {
+    OS << "CYCLES WITH DIVERGENT EXIT:\n";
+    for (const CycleT *cycle : DivergentExitCycles) {
+      OS << "  " << cycle->print(Context) << '\n';
+    }
+  }
+
+  for (auto &block : F) {
+    OS << "\nBLOCK " << Context.print(&block) << '\n';
+
+    OS << "DEFINITIONS\n";
+    SmallVector<ConstValueRefT, 16> defs;
+    Context.appendBlockDefs(defs, block);
+    for (auto value : defs) {
+      if (isDivergent(value))
+        OS << "  DIVERGENT: ";
+      else
+        OS << "             ";
+      OS << Context.print(value) << '\n';
+    }
+
+    OS << "TERMINATORS\n";
+    SmallVector<const InstructionT *, 8> terms;
+    Context.appendBlockTerms(terms, block);
+    bool divergentTerminators = hasDivergentTerminator(block);
+    for (auto *T : terms) {
+      if (divergentTerminators)
+        OS << "  DIVERGENT: ";
+      else
+        OS << "             ";
+      OS << Context.print(T) << '\n';
+    }
+
+    OS << "END BLOCK\n";
+  }
+}
+
+template <typename ContextT>
+bool GenericUniformityInfo<ContextT>::hasDivergence() const {
+  return DA->hasDivergence();
+}
+
+/// Whether \p V is divergent at its definition.
+template <typename ContextT>
+bool GenericUniformityInfo<ContextT>::isDivergent(ConstValueRefT V) const {
+  return DA->isDivergent(V);
+}
+
+template <typename ContextT>
+bool GenericUniformityInfo<ContextT>::hasDivergentTerminator(const BlockT &B) {
+  return DA->hasDivergentTerminator(B);
+}
+
+/// \brief Helper function for printing.
+template <typename ContextT>
+void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const {
+  DA->print(out);
+}
+
+template <typename ContextT>
+void llvm::ModifiedPostOrder<ContextT>::computeStackPO(
+    SmallVectorImpl<BlockT *> &Stack, const CycleInfoT &CI, const CycleT *Cycle,
+    SmallPtrSetImpl<BlockT *> &Finalized) {
+  LLVM_DEBUG(dbgs() << "inside computeStackPO\n");
+  while (!Stack.empty()) {
+    auto *NextBB = Stack.back();
+    if (Finalized.count(NextBB)) {
+      Stack.pop_back();
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "  visiting " << CI.getSSAContext().print(NextBB)
+                      << "\n");
+    auto *NestedCycle = CI.getCycle(NextBB);
+    if (Cycle != NestedCycle && (!Cycle || Cycle->contains(NestedCycle))) {
+      LLVM_DEBUG(dbgs() << "  found a cycle\n");
+      while (NestedCycle->getParentCycle() != Cycle)
+        NestedCycle = NestedCycle->getParentCycle();
+
+      SmallVector<BlockT *, 3> NestedExits;
+      NestedCycle->getExitBlocks(NestedExits);
+      bool PushedNodes = false;
+      for (auto *NestedExitBB : NestedExits) {
+        LLVM_DEBUG(dbgs() << "  examine exit: "
+                          << CI.getSSAContext().print(NestedExitBB) << "\n");
+        if (Cycle && !Cycle->contains(NestedExitBB))
+          continue;
+        if (Finalized.count(NestedExitBB))
+          continue;
+        PushedNodes = true;
+        Stack.push_back(NestedExitBB);
+        LLVM_DEBUG(dbgs() << "  pushed exit: "
+                          << CI.getSSAContext().print(NestedExitBB) << "\n");
+      }
+      if (!PushedNodes) {
+        // All loop exits finalized -> finish this node
+        Stack.pop_back();
+        computeCyclePO(CI, NestedCycle, Finalized);
+      }
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "  no nested cycle, going into DAG\n");
+    // DAG-style
+    bool PushedNodes = false;
+    for (auto *SuccBB : successors(NextBB)) {
+      LLVM_DEBUG(dbgs() << "  examine succ: "
+                        << CI.getSSAContext().print(SuccBB) << "\n");
+      if (Cycle && !Cycle->contains(SuccBB))
+        continue;
+      if (Finalized.count(SuccBB))
+        continue;
+      PushedNodes = true;
+      Stack.push_back(SuccBB);
+      LLVM_DEBUG(dbgs() << "  pushed succ: " << CI.getSSAContext().print(SuccBB)
+                        << "\n");
+    }
+    if (!PushedNodes) {
+      // Never push nodes twice
+      LLVM_DEBUG(dbgs() << "  finishing node: "
+                        << CI.getSSAContext().print(NextBB) << "\n");
+      Stack.pop_back();
+      Finalized.insert(NextBB);
+      appendBlock(*NextBB);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "exited computeStackPO\n");
+}
+
+template <typename ContextT>
+void ModifiedPostOrder<ContextT>::computeCyclePO(
+    const CycleInfoT &CI, const CycleT *Cycle,
+    SmallPtrSetImpl<BlockT *> &Finalized) {
+  LLVM_DEBUG(dbgs() << "inside computeCyclePO\n");
+  SmallVector<BlockT *> Stack;
+  auto *CycleHeader = Cycle->getHeader();
+
+  LLVM_DEBUG(dbgs() << "  noted header: "
+                    << CI.getSSAContext().print(CycleHeader) << "\n");
+  assert(!Finalized.count(CycleHeader));
+  Finalized.insert(CycleHeader);
+
+  // Visit the header last
+  LLVM_DEBUG(dbgs() << "  finishing header: "
+                    << CI.getSSAContext().print(CycleHeader) << "\n");
+  appendBlock(*CycleHeader, Cycle->isReducible());
+
+  // Initialize with immediate successors
+  for (auto *BB : successors(CycleHeader)) {
+    LLVM_DEBUG(dbgs() << "  examine succ: " << CI.getSSAContext().print(BB)
+                      << "\n");
+    if (!Cycle->contains(BB))
+      continue;
+    if (BB == CycleHeader)
+      continue;
+    if (!Finalized.count(BB)) {
+      LLVM_DEBUG(dbgs() << "  pushed succ: " << CI.getSSAContext().print(BB)
+                        << "\n");
+      Stack.push_back(BB);
+    }
+  }
+
+  // Compute PO inside region
+  computeStackPO(Stack, CI, Cycle, Finalized);
+
+  LLVM_DEBUG(dbgs() << "exited computeCyclePO\n");
+}
+
+/// \brief Generically compute the modified post order.
+template <typename ContextT>
+void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) {
+  SmallPtrSet<BlockT *, 32> Finalized;
+  SmallVector<BlockT *> Stack;
+  auto *F = CI.getFunction();
+  Stack.reserve(24); // FIXME made-up number
+  Stack.push_back(GraphTraits<FunctionT *>::getEntryNode(F));
+  computeStackPO(Stack, CI, nullptr, Finalized);
+}
+
+} // namespace llvm
+
+#undef DEBUG_TYPE
+
+#endif // LLVM_ADT_GENERICUNIFORMITYIMPL_H
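
As a side note on the algorithm above: the label propagation performed by
DivergencePropagator::computeJoin can be illustrated with a small standalone
sketch. The toy CFG, its block names and the plain std containers below are
made up for illustration only; the real template additionally handles cycles
through the modified post-order and virtual exit edges. Each successor of the
divergent branch is seeded with itself as its label, labels are pushed along
edges in reverse post order, and a block that receives two distinct labels is
a divergent join.

// Minimal standalone sketch of the join-detection idea (not the LLVM template).
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Toy reducible CFG: A branches divergently to B and C, which join at D.
  //   A -> B, A -> C, B -> D, C -> D
  std::map<std::string, std::vector<std::string>> Succs = {
      {"A", {"B", "C"}}, {"B", {"D"}}, {"C", {"D"}}, {"D", {}}};
  std::vector<std::string> RPO = {"A", "B", "C", "D"};

  std::map<std::string, std::string> Label;
  std::set<std::string> Joins;

  // Seed: each successor of the divergent branch is labeled with itself.
  for (const auto &S : Succs["A"])
    Label[S] = S;

  // Push labels along edges; a block that sees a second, different label
  // becomes a join and is relabeled with itself.
  for (const auto &B : RPO) {
    if (!Label.count(B) || B == "A")
      continue;
    for (const auto &S : Succs[B]) {
      const std::string &Pushed = Label[B];
      auto It = Label.find(S);
      if (It == Label.end()) {
        Label[S] = Pushed;            // first label reaching S
      } else if (It->second != Pushed) {
        Label[S] = S;                 // disjoint paths meet: divergent join
        Joins.insert(S);
      }
    }
  }

  for (const auto &J : Joins)
    std::printf("divergent join: %s\n", J.c_str());
  return 0;
}

Running this prints "divergent join: D", matching the intuition that D is the
first block where the two paths diverging at A reconverge.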

diff  --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h
new file mode 100644
index 0000000000000..c4d8c8098dd22
--- /dev/null
+++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h
@@ -0,0 +1,79 @@
+//===- GenericUniformityInfo.h ---------------------------*- C++ -*--------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_GENERICUNIFORMITYINFO_H
+#define LLVM_ADT_GENERICUNIFORMITYINFO_H
+
+// #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/GenericCycleInfo.h"
+// #include "llvm/ADT/SmallPtrSet.h"
+// #include "llvm/ADT/Uniformity.h"
+// #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class TargetTransformInfo;
+
+template <typename ContextT> class GenericUniformityAnalysisImpl;
+
+template <typename ContextT> class GenericUniformityInfo {
+public:
+  using BlockT = typename ContextT::BlockT;
+  using FunctionT = typename ContextT::FunctionT;
+  using ValueRefT = typename ContextT::ValueRefT;
+  using ConstValueRefT = typename ContextT::ConstValueRefT;
+  using InstructionT = typename ContextT::InstructionT;
+  using DominatorTreeT = typename ContextT::DominatorTreeT;
+  using ThisT = GenericUniformityInfo<ContextT>;
+
+  using CycleInfoT = GenericCycleInfo<ContextT>;
+  using CycleT = typename CycleInfoT::CycleT;
+
+  GenericUniformityInfo(FunctionT &F, const DominatorTreeT &DT,
+                        const CycleInfoT &CI,
+                        const TargetTransformInfo *TTI = nullptr);
+  GenericUniformityInfo() = default;
+  GenericUniformityInfo(GenericUniformityInfo &&) = default;
+  GenericUniformityInfo &operator=(GenericUniformityInfo &&) = default;
+
+  /// Whether any divergence was detected.
+  bool hasDivergence() const;
+
+  /// The function this analysis result is for.
+  const FunctionT &getFunction() const { return *F; }
+
+  /// Whether \p V is divergent at its definition.
+  bool isDivergent(ConstValueRefT V) const;
+
+  /// Whether \p V is uniform/non-divergent.
+  bool isUniform(ConstValueRefT V) const { return !isDivergent(V); }
+
+  bool hasDivergentTerminator(const BlockT &B);
+
+  void print(raw_ostream &Out) const;
+
+private:
+  using ImplT = GenericUniformityAnalysisImpl<ContextT>;
+  struct ImplDeleter {
+    void operator()(GenericUniformityAnalysisImpl<ContextT> *Impl);
+  };
+
+  FunctionT *F;
+  std::unique_ptr<ImplT, ImplDeleter> DA;
+
+  GenericUniformityInfo(const GenericUniformityInfo &) = delete;
+  GenericUniformityInfo &operator=(const GenericUniformityInfo &) = delete;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ADT_GENERICUNIFORMITYINFO_H

diff  --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
new file mode 100644
index 0000000000000..adba1cd618324
--- /dev/null
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -0,0 +1,33 @@
+//===- Uniformity.h --------------------------------------*- C++ -*--------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_UNIFORMITY_H
+#define LLVM_ADT_UNIFORMITY_H
+
+namespace llvm {
+
+/// Enum describing how instructions behave with respect to uniformity and
+/// divergence, to answer the question: if the same instruction is executed by
+/// two threads in a convergent set of threads, will its result value(s) be
+/// uniform, i.e. the same on both threads?
+enum class InstructionUniformity {
+  /// The result values are uniform if and only if all operands are uniform.
+  Default,
+
+  /// The result values are always uniform.
+  AlwaysUniform,
+
+  /// The result values can never be assumed to be uniform.
+  NeverUniform
+};
+
+} // namespace llvm
+#endif // LLVM_ADT_UNIFORMITY_H
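
The enum is meant to be folded with operand information in the obvious way:
AlwaysUniform and NeverUniform are absolute, while Default defers to the
operands. A minimal client-side sketch of that rule (the helper function is
hypothetical; only the enum comes from this patch):

#include "llvm/ADT/Uniformity.h"

namespace {
using llvm::InstructionUniformity;

// Returns true if the result can be treated as uniform, given the
// instruction's declared behavior and whether all operands are uniform.
bool resultIsUniform(InstructionUniformity IU, bool AllOperandsUniform) {
  switch (IU) {
  case InstructionUniformity::AlwaysUniform:
    return true;
  case InstructionUniformity::NeverUniform:
    return false;
  case InstructionUniformity::Default:
    return AllOperandsUniform;
  }
  return false; // unreachable; keeps compilers quiet
}
} // namespace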

diff  --git a/llvm/include/llvm/Analysis/UniformityAnalysis.h b/llvm/include/llvm/Analysis/UniformityAnalysis.h
new file mode 100644
index 0000000000000..66ce480d4b761
--- /dev/null
+++ b/llvm/include/llvm/Analysis/UniformityAnalysis.h
@@ -0,0 +1,78 @@
+//===- UniformityAnalysis.h ---------------------*- C++ -*-----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Convergence-aware uniformity info for LLVM IR
+///
+/// This differs from traditional divergence analysis by taking convergence
+/// intrinsics into account.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_UNIFORMITYANALYSIS_H
+#define LLVM_ANALYSIS_UNIFORMITYANALYSIS_H
+
+#include "llvm/ADT/GenericUniformityInfo.h"
+#include "llvm/Analysis/CycleAnalysis.h"
+
+namespace llvm {
+
+extern template class GenericUniformityInfo<SSAContext>;
+using UniformityInfo = GenericUniformityInfo<SSAContext>;
+
+/// Analysis pass which computes \ref UniformityInfo.
+class UniformityInfoAnalysis
+    : public AnalysisInfoMixin<UniformityInfoAnalysis> {
+  friend AnalysisInfoMixin<UniformityInfoAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  /// Provide the result typedef for this analysis pass.
+  using Result = UniformityInfo;
+
+  /// Run the analysis pass over a function and produce uniformity info.
+  UniformityInfo run(Function &F, FunctionAnalysisManager &);
+
+  // TODO: verify analysis
+};
+
+/// Printer pass for the \c UniformityInfo.
+class UniformityInfoPrinterPass
+    : public PassInfoMixin<UniformityInfoPrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit UniformityInfoPrinterPass(raw_ostream &OS);
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Legacy analysis pass which computes \ref UniformityInfo.
+class UniformityInfoWrapperPass : public FunctionPass {
+  Function *m_function = nullptr;
+  UniformityInfo m_uniformityInfo;
+
+public:
+  static char ID;
+
+  UniformityInfoWrapperPass();
+
+  UniformityInfo &getUniformityInfo() { return m_uniformityInfo; }
+  const UniformityInfo &getUniformityInfo() const { return m_uniformityInfo; }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void releaseMemory() override;
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+  // TODO: verify analysis
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_UNIFORMITYANALYSIS_H
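
For context, a new-pass-manager consumer obtains the result through
UniformityInfoAnalysis like any other function analysis. A hedged sketch
follows; the pass itself is hypothetical, and only UniformityInfoAnalysis,
UniformityInfo::hasDivergentTerminator and the usual PassManager machinery
are assumed. Pass registration is omitted.

#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

namespace {
// Hypothetical consumer pass: counts blocks whose terminator is divergent.
struct CountDivergentBranchesPass
    : llvm::PassInfoMixin<CountDivergentBranchesPass> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &FAM) {
    llvm::UniformityInfo &UI = FAM.getResult<llvm::UniformityInfoAnalysis>(F);
    unsigned NumDivergent = 0;
    for (const llvm::BasicBlock &BB : F)
      if (UI.hasDivergentTerminator(BB))
        ++NumDivergent;
    llvm::errs() << F.getName() << ": " << NumDivergent
                 << " block(s) with a divergent terminator\n";
    return llvm::PreservedAnalyses::all();
  }
};
} // namespace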

diff  --git a/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h b/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
index 3f89f2076d503..1c210632f9e1a 100644
--- a/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
+++ b/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
@@ -15,9 +15,8 @@
 #define LLVM_CODEGEN_MACHINECYCLEANALYSIS_H
 
 #include "llvm/ADT/GenericCycleInfo.h"
-#include "llvm/CodeGen/MachineSSAContext.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/InitializePasses.h"
+#include "llvm/CodeGen/MachineSSAContext.h"
 
 namespace llvm {
 

diff  --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index 9a13be7e70477..8efd1d2e95e93 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -205,4 +205,6 @@ DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass,
 DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata, ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity", MachineUniformityInfoWrapperPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("print-machine-uniformity", MachineUniformityInfoPrinterPass, ())
 #undef DUMMY_MACHINE_FUNCTION_PASS

diff  --git a/llvm/include/llvm/CodeGen/MachineSSAContext.h b/llvm/include/llvm/CodeGen/MachineSSAContext.h
index f59d7cf8a5229..e3b2dc4598810 100644
--- a/llvm/include/llvm/CodeGen/MachineSSAContext.h
+++ b/llvm/include/llvm/CodeGen/MachineSSAContext.h
@@ -26,10 +26,17 @@ class Register;
 template <typename _FunctionT> class GenericSSAContext;
 template <typename, bool> class DominatorTreeBase;
 
-inline auto successors(MachineBasicBlock *BB) { return BB->successors(); }
-inline auto predecessors(MachineBasicBlock *BB) { return BB->predecessors(); }
-inline unsigned succ_size(MachineBasicBlock *BB) { return BB->succ_size(); }
-inline unsigned pred_size(MachineBasicBlock *BB) { return BB->pred_size(); }
+inline auto successors(const MachineBasicBlock *BB) { return BB->successors(); }
+inline auto predecessors(const MachineBasicBlock *BB) {
+  return BB->predecessors();
+}
+inline unsigned succ_size(const MachineBasicBlock *BB) {
+  return BB->succ_size();
+}
+inline unsigned pred_size(const MachineBasicBlock *BB) {
+  return BB->pred_size();
+}
+inline auto instrs(const MachineBasicBlock &BB) { return BB.instrs(); }
 
 template <> class GenericSSAContext<MachineFunction> {
   const MachineRegisterInfo *RegInfo = nullptr;
@@ -40,15 +47,25 @@ template <> class GenericSSAContext<MachineFunction> {
   using FunctionT = MachineFunction;
   using InstructionT = MachineInstr;
   using ValueRefT = Register;
+  using ConstValueRefT = Register;
+  static const Register ValueRefNull;
   using DominatorTreeT = DominatorTreeBase<BlockT, false>;
 
-  static MachineBasicBlock *getEntryBlock(MachineFunction &F);
-
   void setFunction(MachineFunction &Fn);
   MachineFunction *getFunction() const { return MF; }
 
-  Printable print(MachineBasicBlock *Block) const;
-  Printable print(MachineInstr *Inst) const;
+  static MachineBasicBlock *getEntryBlock(MachineFunction &F);
+  static void appendBlockDefs(SmallVectorImpl<Register> &defs,
+                              const MachineBasicBlock &block);
+  static void appendBlockTerms(SmallVectorImpl<MachineInstr *> &terms,
+                               MachineBasicBlock &block);
+  static void appendBlockTerms(SmallVectorImpl<const MachineInstr *> &terms,
+                               const MachineBasicBlock &block);
+  MachineBasicBlock *getDefBlock(Register) const;
+  static bool isConstantValuePhi(const MachineInstr &Phi);
+
+  Printable print(const MachineBasicBlock *Block) const;
+  Printable print(const MachineInstr *Inst) const;
   Printable print(Register Value) const;
 };
 

diff  --git a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
new file mode 100644
index 0000000000000..614f09caa3c78
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
@@ -0,0 +1,35 @@
+//===- MachineUniformityAnalysis.h ---------------------------*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Uniformity info for Machine IR
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H
+#define LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H
+
+#include "llvm/ADT/GenericUniformityInfo.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineSSAContext.h"
+
+namespace llvm {
+
+extern template class GenericUniformityInfo<MachineSSAContext>;
+using MachineUniformityInfo = GenericUniformityInfo<MachineSSAContext>;
+
+/// \brief Compute uniformity information for a Machine IR function.
+MachineUniformityInfo
+computeMachineUniformityInfo(MachineFunction &F,
+                             const MachineCycleInfo &cycleInfo,
+                             const MachineDomTree &domTree);
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H
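
A minimal sketch of calling the helper declared above, assuming the caller
already has a valid MachineCycleInfo and MachineDomTree for the function; the
wrapper function itself is hypothetical and only exists for illustration.

#include "llvm/CodeGen/MachineUniformityAnalysis.h"

using namespace llvm;

// Hypothetical helper: returns true if the analysis finds any divergent
// value or divergent terminator in MF.
static bool functionHasDivergence(MachineFunction &MF,
                                  const MachineCycleInfo &CycleInfo,
                                  const MachineDomTree &DomTree) {
  MachineUniformityInfo MUI =
      computeMachineUniformityInfo(MF, CycleInfo, DomTree);
  return MUI.hasDivergence();
}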

diff  --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 3a30693fafa1d..1f95b630e26b7 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -16,6 +16,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Uniformity.h"
 #include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -2053,6 +2055,12 @@ class TargetInstrInfo : public MCInstrInfo {
     return MI.getOperand(0);
   }
 
+  /// Return the uniformity behavior of the given instruction.
+  virtual InstructionUniformity
+  getInstructionUniformity(const MachineInstr &MI) const {
+    return InstructionUniformity::Default;
+  }
+
 private:
   mutable std::unique_ptr<MIRFormatter> Formatter;
   unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;

diff  --git a/llvm/include/llvm/IR/SSAContext.h b/llvm/include/llvm/IR/SSAContext.h
index 8ca23e3ee0774..7551adff1e127 100644
--- a/llvm/include/llvm/IR/SSAContext.h
+++ b/llvm/include/llvm/IR/SSAContext.h
@@ -15,15 +15,24 @@
 #ifndef LLVM_IR_SSACONTEXT_H
 #define LLVM_IR_SSACONTEXT_H
 
+#include "llvm/ADT/GenericSSAContext.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/Support/Printable.h"
 
+#include <memory>
+
 namespace llvm {
 class BasicBlock;
 class Function;
 class Instruction;
 class Value;
+template <typename> class SmallVectorImpl;
 template <typename, bool> class DominatorTreeBase;
-template <typename _FunctionT> class GenericSSAContext;
+
+inline auto instrs(const BasicBlock &BB) {
+  return llvm::make_range(BB.begin(), BB.end());
+}
 
 template <> class GenericSSAContext<Function> {
   Function *F;
@@ -33,16 +42,33 @@ template <> class GenericSSAContext<Function> {
   using FunctionT = Function;
   using InstructionT = Instruction;
   using ValueRefT = Value *;
+  using ConstValueRefT = const Value *;
+  static Value *ValueRefNull;
   using DominatorTreeT = DominatorTreeBase<BlockT, false>;
 
-  static BasicBlock *getEntryBlock(Function &F);
-
   void setFunction(Function &Fn);
   Function *getFunction() const { return F; }
 
-  Printable print(BasicBlock *Block) const;
-  Printable print(Instruction *Inst) const;
-  Printable print(Value *Value) const;
+  static BasicBlock *getEntryBlock(Function &F);
+  static const BasicBlock *getEntryBlock(const Function &F);
+
+  static void appendBlockDefs(SmallVectorImpl<Value *> &defs,
+                              BasicBlock &block);
+  static void appendBlockDefs(SmallVectorImpl<const Value *> &defs,
+                              const BasicBlock &block);
+
+  static void appendBlockTerms(SmallVectorImpl<Instruction *> &terms,
+                               BasicBlock &block);
+  static void appendBlockTerms(SmallVectorImpl<const Instruction *> &terms,
+                               const BasicBlock &block);
+
+  static bool comesBefore(const Instruction *lhs, const Instruction *rhs);
+  static bool isConstantValuePhi(const Instruction &Instr);
+  const BasicBlock *getDefBlock(const Value *value) const;
+
+  Printable print(const BasicBlock *Block) const;
+  Printable print(const Instruction *Inst) const;
+  Printable print(const Value *Value) const;
 };
 
 using SSAContext = GenericSSAContext<Function>;

diff  --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index df0f7419b84cc..b5074907c7638 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -279,6 +279,8 @@ void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &);
 void initializeMachineSchedulerPass(PassRegistry&);
 void initializeMachineSinkingPass(PassRegistry&);
 void initializeMachineTraceMetricsPass(PassRegistry&);
+void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
+void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
 void initializeMachineVerifierPassPass(PassRegistry&);
 void initializeMemCpyOptLegacyPassPass(PassRegistry&);
 void initializeMemDepPrinterPass(PassRegistry&);
@@ -402,6 +404,7 @@ void initializeTLSVariableHoistLegacyPassPass(PassRegistry &);
 void initializeTwoAddressInstructionPassPass(PassRegistry&);
 void initializeTypeBasedAAWrapperPassPass(PassRegistry&);
 void initializeTypePromotionPass(PassRegistry&);
+void initializeUniformityInfoWrapperPassPass(PassRegistry &);
 void initializeUnifyFunctionExitNodesLegacyPassPass(PassRegistry &);
 void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
 void initializeUnpackMachineBundlesPass(PassRegistry&);

diff  --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 3b6018aacf714..1c12275a86ff5 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -142,6 +142,7 @@ add_llvm_component_library(LLVMAnalysis
   TrainingLogger.cpp
   TypeBasedAliasAnalysis.cpp
   TypeMetadataUtils.cpp
+  UniformityAnalysis.cpp
   ScopedNoAliasAA.cpp
   ValueLattice.cpp
   ValueLatticeUtils.cpp

diff  --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
new file mode 100644
index 0000000000000..8194ac3ad7685
--- /dev/null
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -0,0 +1,159 @@
+//===- UniformityAnalysis.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::hasDivergentDefs(
+    const Instruction &I) const {
+  return isDivergent((const Value *)&I);
+}
+
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
+    const Instruction &Instr, bool AllDefsDivergent) {
+  return markDivergent(&Instr);
+}
+
+template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
+  for (auto &I : instructions(F)) {
+    if (TTI->isSourceOfDivergence(&I)) {
+      assert(!I.isTerminator());
+      markDivergent(I);
+    } else if (TTI->isAlwaysUniform(&I)) {
+      addUniformOverride(I);
+    }
+  }
+  for (auto &Arg : F.args()) {
+    if (TTI->isSourceOfDivergence(&Arg)) {
+      markDivergent(&Arg);
+    }
+  }
+}
+
+template <>
+void llvm::GenericUniformityAnalysisImpl<SSAContext>::pushUsers(
+    const Value *V) {
+  for (const auto *User : V->users()) {
+    const auto *UserInstr = dyn_cast<const Instruction>(User);
+    if (!UserInstr)
+      continue;
+    if (isAlwaysUniform(*UserInstr))
+      continue;
+    if (markDivergent(*UserInstr)) {
+      Worklist.push_back(UserInstr);
+    }
+  }
+}
+
+template <>
+void llvm::GenericUniformityAnalysisImpl<SSAContext>::pushUsers(
+    const Instruction &Instr) {
+  assert(!isAlwaysUniform(Instr));
+  if (Instr.isTerminator())
+    return;
+  pushUsers(cast<Value>(&Instr));
+}
+
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::usesValueFromCycle(
+    const Instruction &I, const Cycle &DefCycle) const {
+  if (isAlwaysUniform(I))
+    return false;
+  for (const Use &U : I.operands()) {
+    if (auto *I = dyn_cast<Instruction>(&U)) {
+      if (DefCycle.contains(I->getParent()))
+        return true;
+    }
+  }
+  return false;
+}
+
+// This ensures explicit instantiation of
+// GenericUniformityAnalysisImpl::ImplDeleter::operator()
+template class llvm::GenericUniformityInfo<SSAContext>;
+
+//===----------------------------------------------------------------------===//
+//  UniformityInfoAnalysis and related pass implementations
+//===----------------------------------------------------------------------===//
+
+llvm::UniformityInfo UniformityInfoAnalysis::run(Function &F,
+                                                 FunctionAnalysisManager &FAM) {
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+  auto &CI = FAM.getResult<CycleAnalysis>(F);
+  return UniformityInfo{F, DT, CI, &TTI};
+}
+
+AnalysisKey UniformityInfoAnalysis::Key;
+
+UniformityInfoPrinterPass::UniformityInfoPrinterPass(raw_ostream &OS)
+    : OS(OS) {}
+
+PreservedAnalyses UniformityInfoPrinterPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  OS << "UniformityInfo for function '" << F.getName() << "':\n";
+  AM.getResult<UniformityInfoAnalysis>(F).print(OS);
+
+  return PreservedAnalyses::all();
+}
+
+//===----------------------------------------------------------------------===//
+//  UniformityInfoWrapperPass Implementation
+//===----------------------------------------------------------------------===//
+
+char UniformityInfoWrapperPass::ID = 0;
+
+UniformityInfoWrapperPass::UniformityInfoWrapperPass() : FunctionPass(ID) {
+  initializeUniformityInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(UniformityInfoWrapperPass, "uniforminfo",
+                      "Uniform Info Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(UniformityInfoWrapperPass, "uniforminfo",
+                    "Uniform Info Analysis", true, true)
+
+void UniformityInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addRequired<CycleInfoWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+bool UniformityInfoWrapperPass::runOnFunction(Function &F) {
+  auto &cycleInfo = getAnalysis<CycleInfoWrapperPass>().getResult();
+  auto &domTree = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto &targetTransformInfo =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  m_function = &F;
+  m_uniformityInfo =
+      UniformityInfo{F, domTree, cycleInfo, &targetTransformInfo};
+  return false;
+}
+
+void UniformityInfoWrapperPass::print(raw_ostream &OS, const Module *) const {
+  OS << "UniformityInfo for function '" << m_function->getName() << "':\n";
+}
+
+void UniformityInfoWrapperPass::releaseMemory() {
+  m_uniformityInfo = UniformityInfo{};
+  m_function = nullptr;
+}
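
For readers who want to consume the new analysis from their own pass under the
new pass manager, a minimal sketch follows (not part of this commit). It reuses
the getResult<UniformityInfoAnalysis> pattern from UniformityInfoPrinterPass
above; the per-value isDivergent() query is assumed to be exposed by
GenericUniformityInfo, so treat the exact names as illustrative rather than
authoritative.

  // Hypothetical client of UniformityInfoAnalysis; illustrative only.
  #include "llvm/Analysis/UniformityAnalysis.h"
  #include "llvm/IR/InstIterator.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Support/raw_ostream.h"

  namespace {
  struct CountDivergentValuesPass
      : llvm::PassInfoMixin<CountDivergentValuesPass> {
    llvm::PreservedAnalyses run(llvm::Function &F,
                                llvm::FunctionAnalysisManager &FAM) {
      // Registered as the "uniformity" function analysis in PassRegistry.def.
      auto &UI = FAM.getResult<llvm::UniformityInfoAnalysis>(F);
      unsigned NumDivergent = 0;
      for (llvm::Instruction &I : llvm::instructions(F))
        if (UI.isDivergent(&I)) // assumed query; see GenericUniformityInfo.h
          ++NumDivergent;
      llvm::errs() << F.getName() << ": " << NumDivergent
                   << " divergent instructions\n";
      return llvm::PreservedAnalyses::all();
    }
  };
  } // namespace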

diff  --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index f4165707b3f4e..760d5909875ad 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -142,6 +142,7 @@ add_llvm_component_library(LLVMCodeGen
   MachineSSAUpdater.cpp
   MachineStripDebug.cpp
   MachineTraceMetrics.cpp
+  MachineUniformityAnalysis.cpp
   MachineVerifier.cpp
   MIRFSDiscriminator.cpp
   MIRSampleProfile.cpp

diff  --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index ae9e65f8eef3f..32709091370f1 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -92,6 +92,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeMachineRegionInfoPassPass(Registry);
   initializeMachineSchedulerPass(Registry);
   initializeMachineSinkingPass(Registry);
+  initializeMachineUniformityAnalysisPassPass(Registry);
+  initializeMachineUniformityInfoPrinterPassPass(Registry);
   initializeMachineVerifierPassPass(Registry);
   initializeObjCARCContractLegacyPassPass(Registry);
   initializeOptimizePHIsPass(Registry);

diff  --git a/llvm/lib/CodeGen/MachineCycleAnalysis.cpp b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp
index 6871ac35b300f..8a6030c488937 100644
--- a/llvm/lib/CodeGen/MachineCycleAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp
@@ -9,8 +9,10 @@
 #include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/ADT/GenericCycleImpl.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAContext.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
 
 using namespace llvm;
 

diff  --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index 01cea85ecc7c2..6de8f8da92547 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -21,20 +21,52 @@
 
 using namespace llvm;
 
-MachineBasicBlock *MachineSSAContext::getEntryBlock(MachineFunction &F) {
-  return &F.front();
-}
+const Register MachineSSAContext::ValueRefNull{};
 
 void MachineSSAContext::setFunction(MachineFunction &Fn) {
   MF = &Fn;
   RegInfo = &MF->getRegInfo();
 }
 
-Printable MachineSSAContext::print(MachineBasicBlock *Block) const {
+MachineBasicBlock *MachineSSAContext::getEntryBlock(MachineFunction &F) {
+  return &F.front();
+}
+
+void MachineSSAContext::appendBlockTerms(
+    SmallVectorImpl<const MachineInstr *> &terms,
+    const MachineBasicBlock &block) {
+  for (auto &T : block.terminators())
+    terms.push_back(&T);
+}
+
+void MachineSSAContext::appendBlockDefs(SmallVectorImpl<Register> &defs,
+                                        const MachineBasicBlock &block) {
+  for (const MachineInstr &instr : block.instrs()) {
+    for (const MachineOperand &op : instr.operands()) {
+      if (op.isReg() && op.isDef())
+        defs.push_back(op.getReg());
+    }
+  }
+}
+
+/// Get the defining block of a value.
+MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const {
+  if (!value)
+    return nullptr;
+  return RegInfo->getVRegDef(value)->getParent();
+}
+
+bool MachineSSAContext::isConstantValuePhi(const MachineInstr &Phi) {
+  return Phi.isConstantValuePHI();
+}
+
+Printable MachineSSAContext::print(const MachineBasicBlock *Block) const {
+  if (!Block)
+    return Printable([](raw_ostream &Out) { Out << "<nullptr>"; });
   return Printable([Block](raw_ostream &Out) { Block->printName(Out); });
 }
 
-Printable MachineSSAContext::print(MachineInstr *I) const {
+Printable MachineSSAContext::print(const MachineInstr *I) const {
   return Printable([I](raw_ostream &Out) { I->print(Out); });
 }
 

diff  --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
new file mode 100644
index 0000000000000..a3c4992728d62
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -0,0 +1,222 @@
+//===- MachineUniformityAnalysis.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAContext.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::hasDivergentDefs(
+    const MachineInstr &I) const {
+  for (auto &op : I.operands()) {
+    if (!op.isReg() || !op.isDef())
+      continue;
+    if (isDivergent(op.getReg()))
+      return true;
+  }
+  return false;
+}
+
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
+    const MachineInstr &Instr, bool AllDefsDivergent) {
+  bool insertedDivergent = false;
+  const auto &MRI = F.getRegInfo();
+  const auto &TRI = *MRI.getTargetRegisterInfo();
+  for (auto &op : Instr.operands()) {
+    if (!op.isReg() || !op.isDef())
+      continue;
+    if (!op.getReg().isVirtual())
+      continue;
+    assert(!op.getSubReg());
+    if (!AllDefsDivergent) {
+      auto *RC = MRI.getRegClassOrNull(op.getReg());
+      if (RC && !TRI.isDivergentRegClass(RC))
+        continue;
+    }
+    insertedDivergent |= markDivergent(op.getReg());
+  }
+  return insertedDivergent;
+}
+
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
+  const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
+
+  for (const MachineBasicBlock &block : F) {
+    for (const MachineInstr &instr : block) {
+      auto uniformity = InstrInfo.getInstructionUniformity(instr);
+      if (uniformity == InstructionUniformity::AlwaysUniform) {
+        addUniformOverride(instr);
+        continue;
+      }
+
+      if (uniformity == InstructionUniformity::NeverUniform) {
+        markDefsDivergent(instr, /* AllDefsDivergent = */ false);
+      }
+    }
+  }
+}
+
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers(
+    Register Reg) {
+  const auto &RegInfo = F.getRegInfo();
+  for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
+    if (isAlwaysUniform(UserInstr))
+      continue;
+    if (markDivergent(UserInstr))
+      Worklist.push_back(&UserInstr);
+  }
+}
+
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers(
+    const MachineInstr &Instr) {
+  assert(!isAlwaysUniform(Instr));
+  if (Instr.isTerminator())
+    return;
+  for (const MachineOperand &op : Instr.operands()) {
+    if (op.isReg() && op.isDef() && op.getReg().isVirtual())
+      pushUsers(op.getReg());
+  }
+}
+
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::usesValueFromCycle(
+    const MachineInstr &I, const MachineCycle &DefCycle) const {
+  assert(!isAlwaysUniform(I));
+  for (auto &Op : I.operands()) {
+    if (!Op.isReg() || !Op.readsReg())
+      continue;
+    auto Reg = Op.getReg();
+    assert(Reg.isVirtual());
+    auto *Def = F.getRegInfo().getVRegDef(Reg);
+    if (DefCycle.contains(Def->getParent()))
+      return true;
+  }
+  return false;
+}
+
+// This ensures explicit instantiation of
+// GenericUniformityAnalysisImpl::ImplDeleter::operator()
+template class llvm::GenericUniformityInfo<MachineSSAContext>;
+
+MachineUniformityInfo
+llvm::computeMachineUniformityInfo(MachineFunction &F,
+                                   const MachineCycleInfo &cycleInfo,
+                                   const MachineDomTree &domTree) {
+  auto &MRI = F.getRegInfo();
+  assert(MRI.isSSA() && "Expected to be run on SSA form!");
+  return MachineUniformityInfo(F, domTree, cycleInfo);
+}
+
+namespace {
+
+/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
+class MachineUniformityAnalysisPass : public MachineFunctionPass {
+  MachineUniformityInfo UI;
+
+public:
+  static char ID;
+
+  MachineUniformityAnalysisPass();
+
+  MachineUniformityInfo &getUniformityInfo() { return UI; }
+  const MachineUniformityInfo &getUniformityInfo() const { return UI; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+  // TODO: verify analysis
+};
+
+class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
+public:
+  static char ID;
+
+  MachineUniformityInfoPrinterPass();
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // namespace
+
+char MachineUniformityAnalysisPass::ID = 0;
+
+MachineUniformityAnalysisPass::MachineUniformityAnalysisPass()
+    : MachineFunctionPass(ID) {
+  initializeMachineUniformityAnalysisPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(MachineUniformityAnalysisPass, "machine-uniformity",
+                      "Machine Uniformity Info Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
+                    "Machine Uniformity Info Analysis", true, true)
+
+void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineCycleInfoWrapperPass>();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineUniformityAnalysisPass::runOnMachineFunction(MachineFunction &MF) {
+  auto &DomTree = getAnalysis<MachineDominatorTree>().getBase();
+  auto &CI = getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+  UI = computeMachineUniformityInfo(MF, CI, DomTree);
+  return false;
+}
+
+void MachineUniformityAnalysisPass::print(raw_ostream &OS,
+                                          const Module *) const {
+  OS << "MachineUniformityInfo for function: " << UI.getFunction().getName()
+     << "\n";
+  UI.print(OS);
+}
+
+char MachineUniformityInfoPrinterPass::ID = 0;
+
+MachineUniformityInfoPrinterPass::MachineUniformityInfoPrinterPass()
+    : MachineFunctionPass(ID) {
+  initializeMachineUniformityInfoPrinterPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(MachineUniformityInfoPrinterPass,
+                      "print-machine-uniformity",
+                      "Print Machine Uniformity Info Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
+INITIALIZE_PASS_END(MachineUniformityInfoPrinterPass,
+                    "print-machine-uniformity",
+                    "Print Machine Uniformity Info Analysis", true, true)
+
+void MachineUniformityInfoPrinterPass::getAnalysisUsage(
+    AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineUniformityAnalysisPass>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineUniformityInfoPrinterPass::runOnMachineFunction(
+    MachineFunction &F) {
+  auto &UI = getAnalysis<MachineUniformityAnalysisPass>();
+  UI.print(errs());
+  return false;
+}
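
A usage note on the machine side: the legacy MachineUniformityAnalysisPass is
file-local (anonymous namespace), so code elsewhere in the pipeline would more
likely call computeMachineUniformityInfo() directly, mirroring
runOnMachineFunction() above. A rough sketch under that assumption;
MyLateMachinePass is hypothetical and must addRequired the dominator-tree and
cycle-info wrapper passes exactly as getAnalysisUsage() does above.

  // Hypothetical consumer; illustrative only, not part of this commit.
  #include "llvm/CodeGen/MachineCycleAnalysis.h"
  #include "llvm/CodeGen/MachineDominators.h"
  #include "llvm/CodeGen/MachineUniformityAnalysis.h"
  #include "llvm/Support/raw_ostream.h"

  bool MyLateMachinePass::runOnMachineFunction(llvm::MachineFunction &MF) {
    auto &DomTree = getAnalysis<llvm::MachineDominatorTree>().getBase();
    auto &CycleInfo =
        getAnalysis<llvm::MachineCycleInfoWrapperPass>().getCycleInfo();
    // Requires SSA-form MIR; computeMachineUniformityInfo() asserts this.
    llvm::MachineUniformityInfo MUI =
        llvm::computeMachineUniformityInfo(MF, CycleInfo, DomTree);
    MUI.print(llvm::errs()); // same printer used by print-machine-uniformity
    return false;
  }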

diff  --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 5b865692dd7fe..e758a3fbeac9f 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -13,30 +13,85 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/SSAContext.h"
+#include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
-#include "llvm/IR/ModuleSlotTracker.h"
-#include "llvm/IR/Value.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
+Value *SSAContext::ValueRefNull = nullptr;
+
+void SSAContext::setFunction(Function &Fn) { F = &Fn; }
+
 BasicBlock *SSAContext::getEntryBlock(Function &F) {
   return &F.getEntryBlock();
 }
 
-void SSAContext::setFunction(Function &Fn) { F = &Fn; }
+const BasicBlock *SSAContext::getEntryBlock(const Function &F) {
+  return &F.getEntryBlock();
+}
+
+void SSAContext::appendBlockDefs(SmallVectorImpl<Value *> &defs,
+                                 BasicBlock &block) {
+  for (auto &instr : block.instructionsWithoutDebug(/*SkipPseudoOp=*/true)) {
+    if (instr.isTerminator())
+      break;
+    if (instr.getType()->isVoidTy())
+      continue;
+    auto *def = &instr;
+    defs.push_back(def);
+  }
+}
+
+void SSAContext::appendBlockDefs(SmallVectorImpl<const Value *> &defs,
+                                 const BasicBlock &block) {
+  for (auto &instr : block) {
+    if (instr.isTerminator())
+      break;
+    defs.push_back(&instr);
+  }
+}
+
+void SSAContext::appendBlockTerms(SmallVectorImpl<Instruction *> &terms,
+                                  BasicBlock &block) {
+  terms.push_back(block.getTerminator());
+}
+
+void SSAContext::appendBlockTerms(SmallVectorImpl<const Instruction *> &terms,
+                                  const BasicBlock &block) {
+  terms.push_back(block.getTerminator());
+}
+
+const BasicBlock *SSAContext::getDefBlock(const Value *value) const {
+  if (const auto *instruction = dyn_cast<Instruction>(value))
+    return instruction->getParent();
+  return nullptr;
+}
+
+bool SSAContext::comesBefore(const Instruction *lhs, const Instruction *rhs) {
+  return lhs->comesBefore(rhs);
+}
+
+bool SSAContext::isConstantValuePhi(const Instruction &Instr) {
+  if (auto *Phi = dyn_cast<PHINode>(&Instr))
+    return Phi->hasConstantValue();
+  return false;
+}
 
-Printable SSAContext::print(Value *V) const {
+Printable SSAContext::print(const Value *V) const {
   return Printable([V](raw_ostream &Out) { V->print(Out); });
 }
 
-Printable SSAContext::print(Instruction *Inst) const {
+Printable SSAContext::print(const Instruction *Inst) const {
   return print(cast<Value>(Inst));
 }
 
-Printable SSAContext::print(BasicBlock *BB) const {
+Printable SSAContext::print(const BasicBlock *BB) const {
+  if (!BB)
+    return Printable([](raw_ostream &Out) { Out << "<nullptr>"; });
   if (BB->hasName())
     return Printable([BB](raw_ostream &Out) { Out << BB->getName(); });
 

diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 745c8c981113c..9093b65bbc064 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -71,6 +71,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/PassManager.h"

diff  --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index bbf7b6d0542df..ee4108443cf00 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -235,6 +235,7 @@ FUNCTION_ANALYSIS("targetir",
 FUNCTION_ANALYSIS("verify", VerifierAnalysis())
 FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
 FUNCTION_ANALYSIS("divergence", DivergenceAnalysis())
+FUNCTION_ANALYSIS("uniformity", UniformityInfoAnalysis())
 
 #ifndef FUNCTION_ALIAS_ANALYSIS
 #define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS)                             \
@@ -362,6 +363,7 @@ FUNCTION_PASS("print-cfg-sccs", CFGSCCPrinterPass(dbgs()))
 FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(dbgs()))
 FUNCTION_PASS("print-mustexecute", MustExecutePrinterPass(dbgs()))
 FUNCTION_PASS("print-memderefs", MemDerefPrinterPass(dbgs()))
+FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(dbgs()))
 FUNCTION_PASS("reassociate", ReassociatePass())
 FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass())
 FUNCTION_PASS("reg2mem", RegToMemPass())
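
With the two registrations above in place, the IR-level result can be printed
straight from opt; a typical invocation (the input file name is illustrative)
is:

  opt -passes='print<uniformity>' -disable-output input.ll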

diff  --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 982b0e1e45b84..10cdad30913d4 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -130,6 +130,13 @@ enum : uint64_t {
 
   // Is a WMMA instruction.
   IsWMMA = UINT64_C(1) << 59,
+
+  // Is source of divergence.
+  //
+  // Note: There is no corresponding SIInstrInfo::IsSourceOfDivergence method
+  // by design, since this flag only covers opcodes that are _always_ divergent.
+  // Use SIInstrInfo::getInstructionUniformity for a more complete analysis.
+  IsSourceOfDivergence = UINT64_C(1) << 60
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 430284f1e5289..60077111d89a3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3156,9 +3156,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
         return false;
 
       unsigned NewOpc =
-          IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
-                         : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
-                                                  : AMDGPU::V_FMAMK_F16)
+          IsFMA ? (IsF32                    ? AMDGPU::V_FMAMK_F32
+                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
+                                            : AMDGPU::V_FMAMK_F16)
                 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
@@ -3236,9 +3236,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       }
 
       unsigned NewOpc =
-          IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
-                         : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
-                                                  : AMDGPU::V_FMAAK_F16)
+          IsFMA ? (IsF32                    ? AMDGPU::V_FMAAK_F32
+                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
+                                            : AMDGPU::V_FMAAK_F16)
                 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
@@ -8395,6 +8395,125 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   return SchedModel.computeInstrLatency(&MI);
 }
 
+InstructionUniformity
+SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+  unsigned opcode = MI.getOpcode();
+  if (opcode == AMDGPU::G_INTRINSIC ||
+      opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) {
+    return AMDGPU::isIntrinsicSourceOfDivergence(MI.getIntrinsicID())
+               ? InstructionUniformity::NeverUniform
+               : InstructionUniformity::AlwaysUniform;
+  }
+
+  // Loads from the private and flat address spaces are divergent, because
+  // threads can execute the load instruction with the same inputs and get
+  // different results.
+  //
+  // All other loads are not divergent, because if threads issue loads with the
+  // same arguments, they will always get the same result.
+  if (opcode == AMDGPU::G_LOAD) {
+    if (MI.memoperands_empty())
+      return InstructionUniformity::NeverUniform; // conservative assumption
+
+    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
+          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+        })) {
+      // At least one MMO in a non-global address space.
+      return InstructionUniformity::NeverUniform;
+    }
+    return InstructionUniformity::Default;
+  }
+
+  if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
+      opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
+      opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
+    return InstructionUniformity::NeverUniform;
+  }
+  return InstructionUniformity::Default;
+}
+
+InstructionUniformity
+SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
+  if (MI.getDesc().TSFlags & SIInstrFlags::IsSourceOfDivergence)
+    return InstructionUniformity::NeverUniform;
+
+  // Atomics are divergent because they are executed sequentially: when an
+  // atomic operation refers to the same address in each thread, each thread
+  // after the first sees the value written by the previous thread as the
+  // original value.
+
+  if (isAtomic(MI))
+    return InstructionUniformity::NeverUniform;
+
+  // Loads from the private and flat address spaces are divergent, because
+  // threads can execute the load instruction with the same inputs and get
+  // different results.
+  if (isFLAT(MI) && MI.mayLoad()) {
+    if (MI.memoperands_empty())
+      return InstructionUniformity::NeverUniform; // conservative assumption
+
+    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
+          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+        })) {
+      // At least one MMO in a non-global address space.
+      return InstructionUniformity::NeverUniform;
+    }
+
+    return InstructionUniformity::Default;
+  }
+
+  unsigned opcode = MI.getOpcode();
+  if (opcode == AMDGPU::COPY) {
+    const MachineOperand &srcOp = MI.getOperand(1);
+    if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
+      const TargetRegisterClass *regClass = RI.getPhysRegClass(srcOp.getReg());
+      return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
+                                      : InstructionUniformity::NeverUniform;
+    }
+    return InstructionUniformity::Default;
+  }
+  if (opcode == AMDGPU::INLINEASM || opcode == AMDGPU::INLINEASM_BR) {
+    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+    for (auto &op : MI.operands()) {
+      if (!op.isReg() || !op.isDef())
+        continue;
+      auto *RC = MRI.getRegClass(op.getReg());
+      if (!RC || RI.isDivergentRegClass(RC))
+        return InstructionUniformity::NeverUniform;
+    }
+    return InstructionUniformity::AlwaysUniform;
+  }
+  if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
+    return InstructionUniformity::AlwaysUniform;
+
+  if (opcode == AMDGPU::V_WRITELANE_B32)
+    return InstructionUniformity::NeverUniform;
+
+  // GMIR handling
+  if (SIInstrInfo::isGenericOpcode(opcode))
+    return SIInstrInfo::getGenericInstructionUniformity(MI);
+
+  // Handling $vgpr reads
+  for (auto srcOp : MI.operands()) {
+    if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
+      const TargetRegisterClass *regClass = RI.getPhysRegClass(srcOp.getReg());
+      if (RI.isVGPRClass(regClass))
+        return InstructionUniformity::NeverUniform;
+    }
+  }
+
+  // TODO: The uniformity check conditions above can be rearranged for more
+  // readability.
+
+  // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
+  //       currently turned into no-op COPYs by SelectionDAG ISel and are
+  //       therefore no longer recognizable.
+
+  return InstructionUniformity::Default;
+}
+
 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
   switch (MF.getFunction().getCallingConv()) {
   case CallingConv::AMDGPU_PS:

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d6e53bf3476f4..bc0f44d6ac0fa 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1166,6 +1166,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
 
+  InstructionUniformity
+  getInstructionUniformity(const MachineInstr &MI) const override final;
+
+  InstructionUniformity
+  getGenericInstructionUniformity(const MachineInstr &MI) const;
+
   const MIRFormatter *getMIRFormatter() const override {
     if (!Formatter.get())
       Formatter = std::make_unique<AMDGPUMIRFormatter>();
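
The hook overridden here is declared on TargetInstrInfo, so other targets can
opt in the same way; an instruction with no target-specific knowledge is
expected to fall back to InstructionUniformity::Default and be handled by the
generic def-based analysis. A rough sketch for a hypothetical target (all XYZ
names are made up):

  // Hypothetical XYZInstrInfo override; illustrative only.
  InstructionUniformity
  XYZInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case XYZ::THREAD_ID_X:      // produces a per-lane value: never uniform
      return InstructionUniformity::NeverUniform;
    case XYZ::SCALAR_BROADCAST: // scalar-unit result: always uniform
      return InstructionUniformity::AlwaysUniform;
    default:
      return InstructionUniformity::Default;
    }
  }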

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
new file mode 100644
index 0000000000000..5ca5fd13cb233
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
@@ -0,0 +1,130 @@
+# NOTE: This file is a Generic MIR translation of the test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll test file
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+---
+name:            readfirstlane
+body:             |
+  bb.1:
+    ; CHECK-LABEL: MachineUniformityInfo for function: readfirstlane
+    ; CHECK: DIVERGENT: %{{[0-9]+}}
+    ; CHECK-SAME:llvm.amdgcn.workitem.id.x
+    ; CHECK-NOT: DIVERGENT: {{.*}}llvm.amdgcn.readfirstlane
+    %6:_(p1) = G_IMPLICIT_DEF
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %5:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %4(s32)
+    G_STORE %5(s32), %6(p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            icmp
+body:             |
+  bb.1:
+    liveins: $sgpr4_sgpr5
+    ; CHECK-LABEL: MachineUniformityInfo for function: icmp
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+
+    %3:_(p4) = COPY $sgpr4_sgpr5
+    %13:_(s32) = G_CONSTANT i32 0
+    %7:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
+    %8:_(s32) = G_LOAD %7(p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %9:_(s64) = G_CONSTANT i64 8
+    %10:_(p4) = G_PTR_ADD %7, %9(s64)
+    %11:_(p1) = G_LOAD %10(p4) :: (dereferenceable invariant load (p1), addrspace 4)
+    %12:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.icmp), %8(s32), %13(s32), 33
+    G_STORE %12(s64), %11(p1) :: (volatile store (s64) , addrspace 1)
+    S_ENDPGM 0
+
+...
+---
+name:            fcmp
+body:             |
+  bb.1:
+    liveins: $sgpr4_sgpr5
+    ; CHECK-LABEL: MachineUniformityInfo for function: fcmp
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+
+    %3:_(p4) = COPY $sgpr4_sgpr5
+    %10:_(s32) = G_CONSTANT i32 0
+    %12:_(s32) = G_CONSTANT i32 1
+    %16:_(p1) = G_IMPLICIT_DEF
+    %7:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
+    %8:_(<2 x s32>) = G_LOAD %7(p4) :: (dereferenceable invariant load (<2 x s32>) , align 16, addrspace 4)
+    %9:_(s32) = G_EXTRACT_VECTOR_ELT %8(<2 x s32>), %10(s32)
+    %11:_(s32) = G_EXTRACT_VECTOR_ELT %8(<2 x s32>), %12(s32)
+    %13:_(s64) = G_CONSTANT i64 4
+    %14:_(p4) = G_PTR_ADD %7, %13(s64)
+    %15:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fcmp), %9(s32), %11(s32), 33
+    G_STORE %15(s64), %16(p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+    S_ENDPGM 0
+
+...
+---
+name:            ballot
+body:             |
+  bb.1:
+    liveins: $sgpr4_sgpr5
+    ; CHECK-LABEL: MachineUniformityInfo for function: ballot
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+
+    %2:_(p4) = COPY $sgpr4_sgpr5
+    %10:_(p1) = G_IMPLICIT_DEF
+    %6:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
+    %7:_(s32) = G_LOAD %6(p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %8:_(s1) = G_TRUNC %7(s32)
+    %9:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), %8(s1)
+    G_STORE %9(s64), %10(p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+    S_ENDPGM 0
+
+...
+---
+name:            asm_sgpr
+registers:
+  - { id: 0, class: _, preferred-register: '' }
+  - { id: 1, class: sreg_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: _, preferred-register: '' }
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: MachineUniformityInfo for function: asm_sgpr
+    ; CHECK-NOT: DIVERGENT: %1
+
+    %0:_(s32) = COPY $vgpr0
+    %2:vgpr_32 = COPY %0(s32)
+    INLINEASM &"; def $0, $1", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1, 1835017 /* reguse:VGPR_32 */, %2
+    %3:_(s32) = COPY %1
+    $vgpr0 = COPY %3(s32)
+    SI_RETURN implicit $vgpr0
+
+...
+
+# FIXME: The inline asm below should be divergent.
+---
+name:            asm_mixed_sgpr_vgpr
+registers:
+  - { id: 0, class: _, preferred-register: '' }
+  - { id: 1, class: sreg_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: vgpr_32, preferred-register: '' }
+  - { id: 4, class: _, preferred-register: '' }
+  - { id: 5, class: _, preferred-register: '' }
+  - { id: 6, class: _, preferred-register: '' }
+liveins:         []
+frameInfo:
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: MachineUniformityInfo for function: asm_mixed_sgpr_vgpr
+    ; CHECK: DIVERGENT: %0:
+    ; CHECK: DIVERGENT: %3:
+    ; CHECK: DIVERGENT: %2:
+    ; CHECK: DIVERGENT: %5:
+    %0:_(s32) = COPY $vgpr0
+    %6:_(p1) = G_IMPLICIT_DEF
+    %3:vgpr_32 = COPY %0(s32)
+    INLINEASM &"; def $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, %3
+    %4:_(s32) = COPY %1
+    %5:_(s32) = COPY %2
+    G_STORE %5(s32), %6(p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    SI_RETURN
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform.mir
new file mode 100644
index 0000000000000..35a5a3f110e9c
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/always-uniform.mir
@@ -0,0 +1,55 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# readlane and readfirstlane are always uniform
+
+---
+name:            readlane
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: readlane
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
+    %3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
+    $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+    S_ENDPGM 0
+...
+
+# Readlane with physical register as operand
+---
+name:            readlane2
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: readlane2
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
+    $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+    %5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    S_ENDPGM 0
+...
+
+
+
+# A COPY whose source operand is an SGPR is uniform
+---
+name:            sgprcopy
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: sgprcopy
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+    liveins: $sgpr0,$sgpr1,$vgpr0
+    %0:sgpr_32 = COPY $sgpr0
+    %1:vgpr_32 = COPY $sgpr1
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics-gmir.mir
new file mode 100644
index 0000000000000..3e0236410e65e
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+---
+name:            test1
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    %2:_(s32) = IMPLICIT_DEF
+    %3:_(s32) = IMPLICIT_DEF
+    %0:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+    %1:_(s32) = IMPLICIT_DEF
+
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_ATOMICRMW_XCHG
+    %4:_(s32) = G_ATOMICRMW_XCHG %0(p0), %1 :: (load store seq_cst (s32))
+
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_ATOMIC_CMPXCHG_WITH_SUCCESS
+    %5:_(s32), %6:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS %0(p0), %1, %2 :: (load store seq_cst seq_cst (s32) )
+    $vgpr0 = COPY %4(s32)
+    SI_RETURN implicit $vgpr0
+...
+
+---
+name:            test_atomic_inc_dec
+tracksRegLiveness: true
+body:             |
+  bb.1:
+
+    %2:_(s32) = IMPLICIT_DEF
+    %3:_(s32) = IMPLICIT_DEF
+    %0:_(p1) = G_MERGE_VALUES %2(s32), %3(s32)
+    %1:_(s32) = IMPLICIT_DEF
+    %5:_(s64) = IMPLICIT_DEF
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc)
+    %4:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc), %0(p1), %1(s32), 0, 0, 0 :: (load store (s32) )
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc)
+    %6:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc), %0(p1), %5(s64), 0, 0, 0 :: (load store (s64) )
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec)
+    %7:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec), %0(p1), %1(s32), 0, 0, 0 :: (load store (s32) )
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec)
+    %8:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec), %0(p1), %5(s64), 0, 0, 0 :: (load store (s64) )
+    $vgpr0 = COPY %4(s32)
+    SI_RETURN implicit $vgpr0
+
+...
+
+---
+name:            test_atomics
+tracksRegLiveness: true
+body:             |
+  bb.1:
+
+    %2:_(s32) = IMPLICIT_DEF
+    %3:_(s32) = IMPLICIT_DEF
+    %0:_(p1) = G_MERGE_VALUES %2(s32), %3(s32)
+    %1:_(s32) = IMPLICIT_DEF
+    %5:_(s32) = IMPLICIT_DEF
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_ATOMICRMW_ADD
+    %4:_(s32) = G_ATOMICRMW_ADD %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_ATOMICRMW_SUB
+    %6:_(s32) = G_ATOMICRMW_SUB %1, %5
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_ATOMICRMW_AND
+    %7:_(s32) = G_ATOMICRMW_AND %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_ATOMICRMW_NAND
+    %8:_(s32) = G_ATOMICRMW_NAND %1, %5
+
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_ATOMICRMW_OR
+    %9:_(s32) = G_ATOMICRMW_OR %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_XOR
+    %10:_(s32) = G_ATOMICRMW_XOR %1, %5
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_MAX
+    %11:_(s32) = G_ATOMICRMW_MAX %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_MIN
+    %12:_(s32) = G_ATOMICRMW_MIN %1, %5
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UMAX
+    %13:_(s32) = G_ATOMICRMW_UMAX %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UMIN
+    %14:_(s32) = G_ATOMICRMW_UMIN %1, %5
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FADD
+    %15:_(s32) = G_ATOMICRMW_FADD %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FSUB
+    %16:_(s32) = G_ATOMICRMW_FSUB %1, %5
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMAX
+    %17:_(s32) = G_ATOMICRMW_FMAX %2, %3
+
+    ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMIN
+    %18:_(s32) = G_ATOMICRMW_FMIN %1, %5
+
+    $vgpr0 = COPY %4(s32)
+    SI_RETURN implicit $vgpr0
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics.mir
new file mode 100644
index 0000000000000..b8f34d6bfefcb
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/atomics.mir
@@ -0,0 +1,132 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+---
+name:            test1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: test1
+    %2:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %5:vreg_64 = COPY %3
+    %6:vreg_64 = COPY %3
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN
+    %4:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %5, %2, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN
+    %7:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %6, %2, 0, 1, implicit $exec, implicit $flat_scr ; No memoperands
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+---
+name:            test2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: test2
+    %3:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %4:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %5:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
+    %7:vreg_64 = COPY %4
+    %8:vreg_64 = COPY %5
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: FLAT_ATOMIC_CMPSWAP_RTN
+    %6:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN killed %7, killed %8, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst seq_cst (s32))
+    %9:sreg_64_xexec = V_CMP_EQ_U32_e64 %6, %2, implicit $exec
+    %10:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
+    $vgpr0 = COPY %6
+    $vgpr1 = COPY %10
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            atomic_inc
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  ; CHECK-LABEL: MachineUniformityInfo for function: atomic_inc
+    %2:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %5:vreg_64 = COPY %3
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: GLOBAL_ATOMIC_INC_RTN
+    %4:vgpr_32 = GLOBAL_ATOMIC_INC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+---
+name:            atomic_inc_64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  ; CHECK-LABEL: MachineUniformityInfo for function: atomic_inc_64
+    %3:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %4:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+    %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %7:vreg_64 = COPY %5
+    %8:vreg_64 = COPY %4
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: GLOBAL_ATOMIC_INC_X2_RTN
+    %6:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1)
+    %9:vgpr_32 = COPY %6.sub1
+    %10:vgpr_32 = COPY %6.sub0
+    $vgpr0 = COPY %10
+    $vgpr1 = COPY %9
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            atomic_dec
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  ; CHECK-LABEL: MachineUniformityInfo for function: atomic_dec
+    %2:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %5:vreg_64 = COPY %3
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: GLOBAL_ATOMIC_DEC_RTN
+    %4:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+
+---
+name:            atomic_dec_64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  ; CHECK-LABEL: MachineUniformityInfo for function: atomic_dec_64
+    %3:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %4:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+    %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %7:vreg_64 = COPY %5
+    %8:vreg_64 = COPY %4
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: GLOBAL_ATOMIC_DEC_X2_RTN
+    %6:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1)
+    %9:vgpr_32 = COPY %6.sub1
+    %10:vgpr_32 = COPY %6.sub0
+    $vgpr0 = COPY %10
+    $vgpr1 = COPY %9
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
new file mode 100644
index 0000000000000..779017eafe59e
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
@@ -0,0 +1,79 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge
+# CHECK-LABEL: BLOCK bb.0
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1
+# CHECK: DIVERGENT: G_BR %bb.2
+# CHECK-LABEL: BLOCK bb.1
+# CHECK-LABEL: BLOCK bb.2
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.1, %{{[0-9]*}}:_(s32), %bb.0
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.0
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
+# CHECK: DIVERGENT: G_BR %bb.4
+# CHECK-LABEL: BLOCK bb.3
+# CHECK-LABEL: BLOCK bb.4
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.2, %{{[0-9]*}}:_(s32), %bb.3
+
+---
+name:            hidden_diverge
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $sgpr4_sgpr5
+
+    %4:_(p4) = COPY $sgpr4_sgpr5
+    %15:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_CONSTANT i1 true
+    %23:_(s32) = G_CONSTANT i32 1
+    %30:_(s32) = G_CONSTANT i32 2
+    %32:_(p1) = G_IMPLICIT_DEF
+    %33:_(s32) = G_IMPLICIT_DEF
+    %8:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
+    %9:_(<3 x s32>) = G_LOAD %8(p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
+    %10:_(s64) = G_CONSTANT i64 4
+    %11:_(p4) = G_PTR_ADD %8, %10(s64)
+    %12:_(s64) = G_CONSTANT i64 8
+    %13:_(p4) = G_PTR_ADD %8, %12(s64)
+    %14:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %16:_(s1) = G_ICMP intpred(slt), %14(s32), %15
+    %18:_(s1) = G_XOR %16, %17
+    %19:_(s1), %20:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %16(s1)
+    G_BRCOND %19(s1), %bb.2
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %21:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %15(s32)
+    %22:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %23(s32)
+    %24:_(s1) = G_ICMP intpred(slt), %21(s32), %15
+
+  bb.3:
+    successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+    %25:_(s32) = G_PHI %22(s32), %bb.2, %33(s32), %bb.1
+    %26:_(s1) = G_PHI %24(s1), %bb.2, %18(s1), %bb.1
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s64)
+    %27:_(s1), %28:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %26(s1)
+    G_BRCOND %27(s1), %bb.4
+    G_BR %bb.5
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+
+    %29:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %30(s32)
+
+  bb.5:
+    %31:_(s32) = G_PHI %25(s32), %bb.3, %29(s32), %bb.4
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s64)
+    G_STORE %31(s32), %32(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    S_ENDPGM 0
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge.mir
new file mode 100644
index 0000000000000..5194c9d7926e1
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/hidden-diverge.mir
@@ -0,0 +1,79 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+---
+# CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge
+# CHECK-LABEL: BLOCK bb.0
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32(s32) = COPY $vgpr0
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_GT_I32_e64
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_LT_I32_e64
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = COPY
+# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF
+# CHECK: DIVERGENT: S_BRANCH %bb.1
+# CHECK-LABEL: BLOCK bb.2
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_32 = PHI %{{[0-9]*}}:sreg_32, %bb.0, %{{[0-9]*}}:sreg_32, %bb.1
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = PHI %{{[0-9]*}}:vreg_1, %bb.0, %{{[0-9]*}}:sreg_64, %bb.1
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = COPY %{{[0-9]*}}:vreg_1
+# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF %{{[0-9]*}}:sreg_64, %bb.4
+# CHECK: DIVERGENT: S_BRANCH %bb.3
+# CHECK-LABEL: BLOCK bb.3
+# CHECK-LABEL: BLOCK bb.4
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32 = PHI %{{[0-9]*}}:sreg_32, %bb.2, %{{[0-9]*}}:sreg_32, %bb.3
+
+name:            hidden_diverge
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %11:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %10:vgpr_32(s32) = COPY $vgpr0
+    %15:sreg_64_xexec = S_LOAD_DWORDX2_IMM %11(p4), 36, 0
+    %16:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %11(p4), 44, 0
+    %17:sreg_32 = COPY %15.sub1
+    %18:sreg_32 = COPY %15.sub0
+    %19:sgpr_96 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1, killed %16, %subreg.sub2
+    %0:sgpr_96 = COPY %19
+    %20:sreg_32 = S_MOV_B32 -1
+    %21:sreg_64 = V_CMP_GT_I32_e64 %10(s32), killed %20, implicit $exec
+    %22:sreg_32 = S_MOV_B32 0
+    %23:sreg_64 = V_CMP_LT_I32_e64 %10(s32), killed %22, implicit $exec
+    %1:vreg_1 = COPY %21
+    %14:sreg_32 = IMPLICIT_DEF
+    %2:sreg_64 = SI_IF killed %23, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %24:sreg_32 = COPY %0.sub0
+    %3:sreg_32 = COPY %0.sub1
+    %25:sreg_32 = S_MOV_B32 0
+    S_CMP_LT_I32 killed %24, killed %25, implicit-def $scc
+    %26:sreg_64 = COPY $scc
+    %4:sreg_64 = COPY %26
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.4(0x40000000)
+
+    %5:sreg_32 = PHI %14, %bb.0, %3, %bb.1
+    %6:vreg_1 = PHI %1, %bb.0, %4, %bb.1
+    SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %27:sreg_64 = COPY %6
+    %7:sreg_64 = SI_IF %27, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+
+    %8:sreg_32 = COPY %0.sub2
+
+  bb.4:
+    %9:vgpr_32 = PHI %5, %bb.2, %8, %bb.3
+    SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %28:sreg_64 = IMPLICIT_DEF
+    %29:vreg_64 = COPY %28
+    GLOBAL_STORE_DWORD killed %29, %9, 0, 0, implicit $exec
+    S_ENDPGM 0
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir
new file mode 100644
index 0000000000000..1011f9b411f82
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir
@@ -0,0 +1,59 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+# CHECK-LABEL: MachineUniformityInfo for function: basic
+# CHECK-NEXT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NEXT: depth=1: entries(bb.1 bb.3) bb.2
+# CHECK-LABEL: BLOCK bb.1
+# CHECK: DIVERGENT
+# CHECK: DIVERGENT
+# CHECK-LABEL: BLOCK bb.2
+# CHECK: DIVERGENT
+# CHECK: DIVERGENT
+# CHECK-LABEL: BLOCK bb.3
+# CHECK: DIVERGENT
+# CHECK: DIVERGENT
+# CHECK-LABEL: BLOCK bb.4
+# CHECK-NOT: DIVERGENT
+
+
+---
+name:            basic
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.3, %bb.1
+
+    %0:_(s32) = G_IMPLICIT_DEF
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(s32) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %6:_(s1) = G_ICMP intpred(slt), %1(s32), %0(s32) ;uniform condition
+    %7:_(s1) = G_ICMP intpred(eq), %4(s32), %0 ;divergent condition
+    G_BRCOND %7(s1), %bb.3
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %8:_(s32) = G_PHI %0(s32), %bb.0, %2(s32), %bb.3
+    %9:_(s32) = G_ADD %8(s32), %3(s32)
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.4
+
+    %13:_(s32) = G_ADD %2(s32), %3(s32)
+    %10:_(s32) = G_ADD %8(s32), %3(s32)
+    G_BRCOND %6(s1), %bb.3
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.1
+    %11:_(s32) = G_PHI %13(s32), %bb.2, %0(s32), %bb.0
+    %12:_(s32) = G_ADD %11(s32), %3(s32)
+    G_BR %bb.1
+  bb.4:
+    %14:_(s32) = G_ADD %2(s32), %3(s32)
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir
new file mode 100644
index 0000000000000..cb1fbbc28d5e1
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir
@@ -0,0 +1,118 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# CHECK-LABEL: MachineUniformityInfo for function: divergent_cycle_1
+# CHECK-NEXT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NEXT: depth=1: entries(bb.3 bb.1) bb.4 bb.2
+# CHECK-NEXT: CYCLES WITH DIVERGENT EXIT:
+# CHECK-NEXT: depth=2: entries(bb.4 bb.1) bb.2
+# CHECK-NEXT: depth=1: entries(bb.3 bb.1) bb.4 bb.2
+
+
+
+---
+name:            divergent_cycle_1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.3
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+
+    %2:_(s32) = G_IMPLICIT_DEF
+    %3:_(s32) = G_IMPLICIT_DEF
+
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %6:_(s1) = G_ICMP intpred(slt), %2(s32), %0(s32) ;uniform condition
+    %7:_(s1) = G_ICMP intpred(eq), %4(s32), %0 ;divergent condition
+    G_BRCOND %6(s1), %bb.1
+    G_BR %bb.3
+
+  bb.1:
+    successors: %bb.2
+
+  ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.0, %{{[0-9]*}}:_(s32), %bb.4
+    %8:_(s32) = G_PHI %2(s32), %bb.0, %3(s32), %bb.4
+    %9:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.4
+
+    %13:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BRCOND %7(s1), %bb.4
+    G_BR %bb.3
+
+  bb.3:
+    successors: %bb.4
+
+    %14:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.4
+  bb.4:
+    successors: %bb.5, %bb.1
+
+  ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.2, %{{[0-9]*}}:_(s32), %bb.3
+    %15:_(s32) = G_PHI %13(s32), %bb.2, %14(s32), %bb.3
+    %16:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BRCOND %6(s1), %bb.5
+    G_BR %bb.1
+
+  bb.5:
+    %17:_(s32) = G_ADD %3(s32), %1(s32)
+    S_ENDPGM 0
+...
+
+# CHECK-LABEL: MachineUniformityInfo for function: uniform_cycle_1
+---
+name:            uniform_cycle_1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.5
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+
+    %2:_(s32) = G_IMPLICIT_DEF
+    %3:_(s32) = G_IMPLICIT_DEF
+
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %6:_(s1) = G_ICMP intpred(slt), %2(s32), %0(s32) ;uniform condition
+    %7:_(s1) = G_ICMP intpred(eq), %4(s32), %0 ;divergent condition
+    G_BRCOND %6(s1), %bb.1
+    G_BR %bb.5
+
+  bb.1:
+    successors: %bb.2
+
+  ; CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.0, %{{[0-9]*}}:_(s32), %bb.4
+    %8:_(s32) = G_PHI %2(s32), %bb.0, %3(s32), %bb.5
+    %9:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.4
+
+    %13:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BRCOND %7(s1), %bb.4
+    G_BR %bb.3
+
+  bb.3:
+    successors: %bb.4
+
+    %14:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.4
+  bb.4:
+    successors: %bb.6, %bb.5
+
+  ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.2, %{{[0-9]*}}:_(s32), %bb.3
+    %15:_(s32) = G_PHI %13(s32), %bb.2, %14(s32), %bb.3
+    %16:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BRCOND %6(s1), %bb.6
+    G_BR %bb.5
+
+  bb.5:
+    successors: %bb.1
+    %18:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.1
+  bb.6:
+    %17:_(s32) = G_ADD %3(s32), %1(s32)
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir
new file mode 100644
index 0000000000000..b637a9d9350b9
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir
@@ -0,0 +1,58 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# CHECK-LABEL: MachineUniformityInfo for function: basic
+# CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+# CHECK: CYCLES WITH DIVERGENT EXIT:
+# CHECK: depth=1: entries(bb.1 bb.3) bb.2
+
+---
+name:            basic
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.3, %bb.1
+
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+
+    %2:_(s32) = G_IMPLICIT_DEF
+    %3:_(s32) = G_IMPLICIT_DEF
+
+    %6:_(s1) = G_ICMP intpred(slt), %2(s32), %0(s32) ;uniform condition
+    %7:_(s1) = G_ICMP intpred(eq), %4(s32), %0 ;divergent condition
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    G_BRCOND %6(s1), %bb.3
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %8:_(s32) = G_PHI %2(s32), %bb.0, %3(s32), %bb.3
+    %10:_(s32) = G_PHI %2(s32), %bb.0, %16(s32), %bb.3
+    %9:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.4
+
+    %13:_(s32) = G_ADD %3(s32), %1(s32)
+    %14:_(s32) = G_ADD %10(s32), %1(s32)
+    %15:_(s32) = G_ADD %10(s32), %1(s32)
+
+    G_BRCOND %7(s1), %bb.3
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.1
+    %16:_(s32) = G_PHI %13(s32), %bb.2, %2(s32), %bb.0
+    %17:_(s32) = G_ADD %3(s32), %1(s32)
+    G_BR %bb.1
+  bb.4:
+    ; CHECK-LABEL: bb.4
+    ; CHECK: DIVERGENT:
+    ; CHECK: DIVERGENT:
+    ; CHECK-NOT: DIVERGENT:
+    %18:_(s32) = G_ADD %8(s32), %3(s32)
+    %19:_(s32) = G_ADD %8(s32), %3(s32)
+    %20:_(s32) = G_ADD %3(s32), %1(s32)
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
new file mode 100644
index 0000000000000..7a7b77320c04a
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
@@ -0,0 +1,53 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+# CHECK-LABEL: MachineUniformityInfo for function: irreducible
+# CHECK: CYCLES ASSSUMED DIVERGENT:
+# CHECK:   depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4
+# CHECK: CYCLES WITH DIVERGENT EXIT:
+# CHECK:   depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4
+# CHECK:   depth=2: entries(bb.3 bb.1) bb.5 bb.4
+
+---
+name:            irreducible
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
+
+    %0:sreg_32 = IMPLICIT_DEF
+    %2:vgpr_32 = COPY $vgpr0
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    S_CMP_EQ_U32 %0, 0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+    S_BRANCH %bb.2
+
+  bb.1:
+    %28:vgpr_32 = PHI %3, %bb.0, %49, %bb.5
+    %29:vgpr_32 = V_ADD_U32_e64 %28, 1, 0, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    %38:vgpr_32 = PHI %3, %bb.0, %49, %bb.4
+    %39:vgpr_32 = V_ADD_U32_e64 %38, 2, 0, implicit $exec
+
+  bb.3:
+    %49:vgpr_32 = PHI %29, %bb.1, %39, %bb.2
+
+  bb.4:
+    successors: %bb.2, %bb.5
+
+    %50:vgpr_32 = V_AND_B32_e32 3, %2, implicit $exec
+    %51:sreg_64 = V_CMP_EQ_U32_e64 %50, 2, implicit $exec
+    %52:sreg_64 = SI_IF killed %51:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.5:
+    successors: %bb.1, %bb.6
+    %61:sreg_64 = V_CMP_EQ_U32_e64 %50, 1, implicit $exec
+    %62:sreg_64 = SI_IF killed %61:sreg_64, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.6:
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir
new file mode 100644
index 0000000000000..dcaaf6e1aa59b
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir
@@ -0,0 +1,89 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+#        bb0(div)
+#       /    \
+#     bb1 <-> bb2
+#              |
+#             bb3
+# CHECK-LABEL: MachineUniformityInfo for function: cycle_diverge_enter
+# CHECK-NEXT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NEXT: depth=1: entries(bb.2 bb.1)
+# CHECK-NEXT: CYCLES WITH DIVERGENT EXIT:
+# CHECK-NEXT: depth=1: entries(bb.2 bb.1)
+---
+name:            cycle_diverge_enter
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+
+    %0:_(s32) = G_IMPLICIT_DEF
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(s32) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %6:_(s1) = G_ICMP intpred(slt), %4(s32), %1 ; DIVERGENT CONDITION
+    %7:_(s1) = G_ICMP intpred(slt), %2(s32), %1 ; UNIFORM CONDITION
+    G_BRCOND %6(s1), %bb.1 ; divergent branch
+    G_BR %bb.2
+  bb.1:
+    successors: %bb.2
+
+    %8:_(s32) = G_PHI %1(s32), %bb.0, %0(s32), %bb.2
+    G_BR %bb.2
+  bb.2:
+    successors: %bb.1, %bb.3
+
+    %9:_(s32) = G_PHI %2(s32), %bb.1, %3(s32), %bb.0
+    %10:_(s1) = G_ICMP intpred(eq), %9(s32), %1(s32)
+    G_BRCOND %10(s1), %bb.3  ; divergent branch
+    G_BR %bb.1
+
+  bb.3:
+    %11:_(s32), %12:_(s1) = G_UADDO %9, %3
+    S_ENDPGM 0
+
+...
+
+
+# CHECK-LABEL: MachineUniformityInfo for function: cycle_diverge_exit
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32), %{{[0-9]*}}:_(s1) = G_UADDO %8:_, %{{[0-9]*}}:_
+#        bb0
+#       /    \
+#     bb1 <-> bb2(div)
+#              |
+#             bb3
+---
+name:            cycle_diverge_exit
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+
+    %0:_(s32) = G_IMPLICIT_DEF
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(s32) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %6:_(s1) = G_ICMP intpred(slt), %4(s32), %1 ; DIVERGENT CONDITION
+    %7:_(s1) = G_ICMP intpred(slt), %2(s32), %1 ; UNIFORM CONDITION
+    G_BRCOND %7(s1), %bb.1 ; uniform branch
+    G_BR %bb.2
+  bb.1:
+    successors: %bb.2
+
+    %8:_(s32) = G_PHI %1(s32), %bb.0, %0(s32), %bb.2
+    G_BR %bb.2
+  bb.2:
+    successors: %bb.1, %bb.3
+
+    %9:_(s32) = G_PHI %2(s32), %bb.1, %3(s32), %bb.0
+    %10:_(s1) = G_ICMP intpred(sgt), %9(s32), %1(s32)
+    G_BRCOND %6(s1), %bb.3  ; divergent branch
+    G_BR %bb.1
+
+  bb.3:
+    %11:_(s32), %12:_(s1) = G_UADDO %9, %3
+    S_ENDPGM 0
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/join-loopexit-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/join-loopexit-gmir.mir
new file mode 100644
index 0000000000000..4a1511b0baec2
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/join-loopexit-gmir.mir
@@ -0,0 +1,57 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# CHECK-LABEL: MachineUniformityInfo for function: test
+
+# CHECK-LABEL: BLOCK bb.0
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(eq), %{{[0-9]*}}:_(s32), %{{[0-9]*}}:_
+
+# CHECK-LABEL: BLOCK bb.1
+# CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.0, %{{[0-9]*}}:_(s32), %bb.2
+# CHECK:  DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
+
+# CHECK-LABEL: BLOCK bb.2
+# CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32), %{{[0-9]*}}:_(s1) = G_UADDO_
+# CHECK-NOT: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
+
+# CHECK-LABEL: BLOCK bb.3
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.2
+# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.4
+# CHECK: DIVERGENT: G_BR %bb.5
+
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s1) = G_CONSTANT i1 false
+    %1:_(s32) = G_CONSTANT i32 0
+    %20:_(s32) = G_CONSTANT i32 7
+    %5:_(s32) = G_CONSTANT i32 -1
+    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %6:_(s1) = G_ICMP intpred(eq), %4(s32), %5
+
+  bb.1:
+    successors: %bb.2, %bb.3
+
+    %8:_(s32) = G_PHI %20(s32), %bb.0, %21(s32), %bb.2
+    G_BRCOND %6(s1), %bb.3   ; Entrance to loop is divergent
+  bb.2:
+    successors: %bb.3, %bb.1
+
+    %21:_(s32), %22:_(s1) = G_UADDO %8, %5
+    %23:_(s1) = G_ICMP intpred(eq), %21(s32), %1
+    G_BRCOND %23(s1), %bb.3
+    G_BR %bb.1
+
+  bb.3:
+    %31:_(s1) = G_PHI %2(s1), %bb.1, %3(s1), %bb.2
+    S_ENDPGM 0
+    G_BRCOND %31(s1), %bb.4
+    G_BR %bb.5
+  bb.4:
+    G_BR %bb.5
+  bb.5:
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/loads-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/loads-gmir.mir
new file mode 100644
index 0000000000000..4f20f4b433e1b
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/loads-gmir.mir
@@ -0,0 +1,36 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+---
+name:            loads
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    %1:_(p0) = G_IMPLICIT_DEF
+    %4:_(p1) = G_IMPLICIT_DEF
+    %6:_(p5) = G_IMPLICIT_DEF
+
+    ; Atomic load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_LOAD
+    %0:_(s32) = G_LOAD %1(p0) :: (load seq_cst (s32) from `ptr undef`)
+
+    ; flat load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_LOAD
+    %2:_(s32) = G_LOAD %1(p0) :: (load (s32) from `ptr undef`)
+
+    ; Global load
+    ; CHECK-NOT: DIVERGENT
+    %3:_(s32) = G_LOAD %4(p1) :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+
+    ; Private load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_LOAD
+    %5:_(s32) = G_LOAD %6(p5) :: (volatile load (s32) from `ptr addrspace(5) undef`, addrspace 5)
+    G_STORE %2(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %3(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %5(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %0(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    SI_RETURN
+
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/never-uniform.mir
new file mode 100644
index 0000000000000..a125441613941
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/never-uniform.mir
@@ -0,0 +1,138 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# loads from the flat address space are not uniform
+---
+name:            flatloads
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: flatloads
+    ; CHECK: DIVERGENT: %1
+    ; CHECK-NOT: DIVERGENT: %2
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
+    %3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
+    S_ENDPGM 0
+...
+
+# loads from the scratch address space are not uniform
+---
+name:            scratchloads
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: scratchloads
+    ; CHECK: DIVERGENT: %1
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1:vgpr_32 = SCRATCH_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
+    S_ENDPGM 0
+...
+
+# Global load
+---
+name:            globalloads
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: globalloads
+    ; CHECK: DIVERGENT: %2
+    ; CHECK-NOT: DIVERGENT: %3
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    %4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
+    S_ENDPGM 0
+...
+
+# FIXME: ADDTID instruction might be incorrectly marked uniform
+---
+name:            dsreads
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: dsreads
+    ; CHECK-NEXT: ALL VALUES UNIFORM
+    %0:vreg_64 = IMPLICIT_DEF
+    $m0 = S_MOV_B32 0
+    %1:vgpr_32 = DS_READ_ADDTID_B32 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+# copy source == $sgpr => uniform, $vgpr => divergent
+---
+name:            sgprcopy
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: sgprcopy
+    ; CHECK: DIVERGENT: %2
+    liveins: $sgpr0,$sgpr1,$vgpr0
+    %0:sgpr_32 = COPY $sgpr0
+    %1:vgpr_32 = COPY $sgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    S_ENDPGM 0
+...
+
+# writelane is not uniform
+---
+name:            writelane
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: writelane
+    ; CHECK: DIVERGENT: %4
+    ; CHECK: DIVERGENT: %5
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
+    %3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
+    $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+
+    %4:vgpr_32 = V_WRITELANE_B32 0, 0, %0, implicit $exec
+    %5:sreg_64 = V_CMP_EQ_U32_e64 %0, %4, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+
+  bb.1:
+    %16:vgpr_32 = IMPLICIT_DEF
+    S_ENDPGM 0
+...
+# Directly reading a physical vgpr is not uniform
+---
+name:            physicalreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: physicalreg
+    ; CHECK: DIVERGENT: %0
+    ; CHECK: DIVERGENT: %1
+    ; CHECK: DIVERGENT: %2
+    ; CHECK: DIVERGENT: %3
+    ; CHECK: DIVERGENT: %4
+    ; CHECK-NOT: DIVERGENT
+    ; CHECK: DIVERGENT: %5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_AND_B32_e32 %1, $vgpr3, implicit $exec
+    %3:vgpr_32 = V_ADD_U32_e32 $vgpr2, $vgpr3, implicit $exec
+    %4:vgpr_32 = V_SUB_CO_U32_e32 $vgpr2, $vgpr3, implicit $exec, implicit-def $vcc
+    %5:vgpr_32 = V_AND_B32_e32 $vgpr4, $vgpr5, implicit $exec
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
new file mode 100644
index 0000000000000..77a856baf65e9
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
@@ -0,0 +1,76 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# CHECK-LABEL: MachineUniformityInfo for function: hidden_loop_diverge
+
+# CHECK-LABEL: BLOCK bb.0
+# CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt), %{{[0-9]*}}:_(s32), %{{[0-9]*}}:_
+# CHECK-NOT: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
+# CHECK-NOT: DIVERGENT: G_BR %bb.1
+
+# CHECK-LABEL: BLOCK bb.1
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt), %{{[0-9]*}}:_(s32), %{{[0-9]*}}:_
+# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
+# CHECK: DIVERGENT: G_BR %bb.2
+
+# CHECK-LABEL: BLOCK bb.2
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(sgt), %{{[0-9]*}}:_(s32), %{{[0-9]*}}:_
+# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.4
+# CHECK: DIVERGENT: G_BR %bb.1
+
+# CHECK-LABEL: BLOCK bb.3
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.0, %{{[0-9]*}}:_(s32), %bb.1
+# CHECK-NOT: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.4
+# CHECK-NOT: DIVERGENT: G_BR %bb.5
+
+# CHECK-LABEL: BLOCK bb.4
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.2
+
+# CHECK-LABEL: BLOCK bb.5
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.4
+
+---
+name:            hidden_loop_diverge
+tracksRegLiveness: true
+body:             |
+    bb.0:
+        successors: %bb.3, %bb.1
+        liveins: $sgpr4_sgpr5
+
+        %0:_(s32) = G_IMPLICIT_DEF
+        %20:_(s32) = G_IMPLICIT_DEF
+        %21:_(s32) = G_CONSTANT i32 42
+        %22:_(s32) = G_IMPLICIT_DEF
+        %1:_(s32) = G_CONSTANT i32 0
+        %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+        %3:_(s1) = G_ICMP intpred(slt), %0(s32), %1
+        G_BRCOND %3(s1), %bb.3                            ; Uniform branch
+        G_BR %bb.1
+    bb.1:
+        successors: %bb.3, %bb.2
+
+        %4:_(s32) = G_PHI %1(s32), %bb.0, %7(s32), %bb.2
+        %5:_(s1) = G_ICMP intpred(slt), %1(s32), %2(s32)
+        G_BRCOND %5(s1), %bb.3
+        G_BR %bb.2
+    bb.2:
+        successors: %bb.4, %bb.1
+
+        %6:_(s32) = G_CONSTANT i32 1
+        %7:_(s32) = G_ADD %6(s32), %4(s32)
+        %8:_(s1) = G_ICMP intpred(sgt), %2(s32), %1(s32)
+        G_BRCOND %8(s1), %bb.4
+        G_BR %bb.1
+    bb.3:
+        successors: %bb.4, %bb.5
+
+        %9:_(s32) = G_PHI %20(s32), %bb.0, %4(s32), %bb.1    ; Temporal divergent phi
+        G_BRCOND %3(s1), %bb.4
+        G_BR %bb.5
+
+    bb.4:
+        successors: %bb.5
+
+        %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2
+        G_BR %bb.5
+    bb.5:
+        %11:_(s32) = G_PHI %20(s32), %bb.3, %22(s32), %bb.4
+...

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
index ec4ccbd281a42..0f055183c7983 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; CHECK-LABEL: for function 'readfirstlane':
 define amdgpu_kernel void @readfirstlane() {
@@ -39,7 +40,7 @@ define i32 @asm_sgpr(i32 %divergent) {
   ret i32 %sgpr
 }
 
-; CHECK-LABEL: Divergence Analysis' for function 'asm_mixed_sgpr_vgpr':
+; CHECK-LABEL: for function 'asm_mixed_sgpr_vgpr':
 ; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1, $2", "=s,=v,v"(i32 %divergent)
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
 ; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 1

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
index 123d9b48abeca..7c7e5e17ef39b 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; CHECK: DIVERGENT: %orig = atomicrmw xchg ptr %ptr, i32 %val seq_cst
 define amdgpu_kernel void @test1(ptr %ptr, i32 %val) #0 {

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
index f43f5f7d23df8..2b0a02cd6a5b8 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
@@ -1,12 +1,13 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 declare i32 @gf2(i32)
 declare i32 @gf1(i32)
 
 define  void @tw1(ptr addrspace(4) noalias nocapture readonly %A, ptr addrspace(4) noalias nocapture %B) local_unnamed_addr #2 {
-; CHECK: Divergence Analysis' for function 'tw1':
-; CHECK: DIVERGENT: ptr addrspace(4) %A
-; CHECK: DIVERGENT: ptr addrspace(4) %B
+; CHECK: for function 'tw1':
+; CHECK-DAG: DIVERGENT: ptr addrspace(4) %A
+; CHECK-DAG: DIVERGENT: ptr addrspace(4) %B
 entry:
 ; CHECK: DIVERGENT:       %call = tail call i32 @gf2(i32 0) #0
 ; CHECK: DIVERGENT:       %cmp = icmp ult i32 %call, 16

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll
index e26f0f8790b41..e2f3bfe46e716 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll
@@ -1,8 +1,9 @@
 ; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; Tests control flow intrinsics that should be treated as uniform
 
-; CHECK: Divergence Analysis' for function 'test_if_break':
+; CHECK: for function 'test_if_break':
 ; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
 ; CHECK-NOT: DIVERGENT
 ; CHECK: ret void
@@ -14,7 +15,7 @@ entry:
   ret void
 }
 
-; CHECK: Divergence Analysis' for function 'test_if':
+; CHECK: for function 'test_if':
 ; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
 ; CHECK-NEXT: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
 ; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
@@ -33,7 +34,7 @@ entry:
 }
 
 ; The result should still be treated as divergent, even with a uniform source.
-; CHECK: Divergence Analysis' for function 'test_if_uniform':
+; CHECK: for function 'test_if_uniform':
 ; CHECK-NOT: DIVERGENT
 ; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
 ; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
@@ -51,7 +52,7 @@ entry:
   ret void
 }
 
-; CHECK: Divergence Analysis' for function 'test_loop_uniform':
+; CHECK: for function 'test_loop_uniform':
 ; CHECK: DIVERGENT: %loop = call i1 @llvm.amdgcn.loop.i64(i64 %mask)
 define amdgpu_ps void @test_loop_uniform(i64 inreg %mask) {
 entry:
@@ -61,7 +62,7 @@ entry:
   ret void
 }
 
-; CHECK: Divergence Analysis' for function 'test_else':
+; CHECK: for function 'test_else':
 ; CHECK: DIVERGENT: %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
 ; CHECK: DIVERGENT:       %else.bool = extractvalue { i1, i64 } %else, 0
 ; CHECK: {{^[ \t]+}}%else.mask = extractvalue { i1, i64 } %else, 1
@@ -77,7 +78,7 @@ entry:
 }
 
 ; This case is probably always broken
-; CHECK: Divergence Analysis' for function 'test_else_divergent_mask':
+; CHECK: for function 'test_else_divergent_mask':
 ; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
 ; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
 ; CHECK-NOT: DIVERGENT

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
index 767e2c1bd66e0..c2b707ac4251d 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
@@ -1,11 +1,13 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: 'Divergence Analysis' for function 'hidden_diverge'
+; CHECK-LABEL: for function 'hidden_diverge'
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %cond.var = icmp slt i32 %tid, 0
   br i1 %cond.var, label %B, label %C ; divergent
+; CHECK:  DIVERGENT: %cond.var =
 ; CHECK: DIVERGENT: br i1 %cond.var,
 B:
   %cond.uni = icmp slt i32 %n, 0
@@ -22,7 +24,7 @@ merge:
 }
 
 define amdgpu_kernel void @hidden_loop_ipd(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: 'Divergence Analysis' for function 'hidden_loop_ipd'
+; CHECK-LABEL: for function 'hidden_loop_ipd'
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %cond.var = icmp slt i32 %tid, 0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
index d7d8bca5562b5..8aa13c4bb5faa 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -1,9 +1,10 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; divergent loop (H<header><exiting to X>, B<exiting to Y>)
 ; the divergent join point in %exit is obscured by uniform control joining in %X
 define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'hidden_loop_diverge'
+; CHECK-LABEL: for function 'hidden_loop_diverge':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -45,7 +46,7 @@ exit:
 ; divergent loop (H<header><exiting to X>, B<exiting to Y>)
 ; the phi nodes in X and Y don't actually receive divergent values
 define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'unobserved_loop_diverge':
+; CHECK-LABEL: for function 'unobserved_loop_diverge':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -86,7 +87,7 @@ exit:
 ; the inner loop has no exit to top level.
 ; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D.
 define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'hidden_nestedloop_diverge':
+; CHECK-LABEL: for function 'hidden_nestedloop_diverge':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -137,7 +138,7 @@ exit:
 ; the outer loop has no immediately divergent exiting edge.
 ; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent.
 define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'hidden_doublebreak_diverge':
+; CHECK-LABEL: for function 'hidden_doublebreak_diverge':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -179,7 +180,7 @@ Y:
 
 ; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)
 define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'hidden_containedloop_diverge':
+; CHECK-LABEL: for function 'hidden_containedloop_diverge':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
index 5f8176acb9495..8d3df813bc140 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
@@ -1,50 +1,52 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -mcpu=tahiti -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -mcpu=gfx908 -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -mcpu=tahiti -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -mcpu=gfx908 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 ; Make sure nothing crashes on targets with or without AGPRs
 
-; CHECK: Divergence Analysis' for function 'inline_asm_1_sgpr_virtreg_output':
+; CHECK-LABEL: for function 'inline_asm_1_sgpr_virtreg_output':
 ; CHECK-NOT: DIVERGENT
 define i32 @inline_asm_1_sgpr_virtreg_output() {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
   ret i32 %sgpr
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_1_sgpr_physreg_output':
+; CHECK-LABEL: for function 'inline_asm_1_sgpr_physreg_output':
 ; CHECK-NOT: DIVERGENT
 define i32 @inline_asm_1_sgpr_physreg_output() {
   %sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()
   ret i32 %sgpr
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_1_vgpr_virtreg_output':
+; CHECK-LABEL: for function 'inline_asm_1_vgpr_virtreg_output':
 ; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
 define i32 @inline_asm_1_vgpr_virtreg_output() {
   %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
   ret i32 %vgpr
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_1_vgpr_physreg_output':
+; CHECK-LABEL: for function 'inline_asm_1_vgpr_physreg_output':
 ; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
 define i32 @inline_asm_1_vgpr_physreg_output() {
   %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
   ret i32 %vgpr
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_1_agpr_virtreg_output':
+; CHECK-LABEL: for function 'inline_asm_1_agpr_virtreg_output':
 ; CHECK: DIVERGENT: %vgpr = call i32 asm "; def $0", "=a"()
 define i32 @inline_asm_1_agpr_virtreg_output() {
   %vgpr = call i32 asm "; def $0", "=a"()
   ret i32 %vgpr
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_1_agpr_physreg_output':
+; CHECK-LABEL: for function 'inline_asm_1_agpr_physreg_output':
 ; CHECK: DIVERGENT: %vgpr = call i32 asm "; def a0", "={a0}"()
 define i32 @inline_asm_1_agpr_physreg_output() {
   %vgpr = call i32 asm "; def a0", "={a0}"()
   ret i32 %vgpr
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_2_sgpr_virtreg_output':
+; CHECK-LABEL: for function 'inline_asm_2_sgpr_virtreg_output':
 ; CHECK-NOT: DIVERGENT
 define void @inline_asm_2_sgpr_virtreg_output() {
   %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s"()
@@ -56,7 +58,7 @@ define void @inline_asm_2_sgpr_virtreg_output() {
 }
 
 ; One output is SGPR, one is VGPR. Infer divergent for the aggregate, but uniform on the SGPR extract
-; CHECK: Divergence Analysis' for function 'inline_asm_sgpr_vgpr_virtreg_output':
+; CHECK-LABEL: for function 'inline_asm_sgpr_vgpr_virtreg_output':
 ; CHECK: DIVERGENT:       %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
 ; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 1
@@ -69,7 +71,7 @@ define void @inline_asm_sgpr_vgpr_virtreg_output() {
   ret void
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output':
+; CHECK-LABEL: for function 'inline_asm_vgpr_sgpr_virtreg_output':
 ; CHECK: DIVERGENT:       %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
 ; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 0
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
@@ -83,7 +85,7 @@ define void @inline_asm_vgpr_sgpr_virtreg_output() {
 }
 
 ; Have an extra output constraint
-; CHECK: Divergence Analysis' for function 'multi_sgpr_inline_asm_output_input_constraint':
+; CHECK-LABEL: for function 'multi_sgpr_inline_asm_output_input_constraint':
 ; CHECK-NOT: DIVERGENT
 define void @multi_sgpr_inline_asm_output_input_constraint() {
   %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s,s"(i32 1234)
@@ -94,7 +96,7 @@ define void @multi_sgpr_inline_asm_output_input_constraint() {
   ret void
 }
 
-; CHECK: Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output_input_constraint':
+; CHECK-LABEL: for function 'inline_asm_vgpr_sgpr_virtreg_output_input_constraint':
 ; CHECK: DIVERGENT:       %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
 ; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 0
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll
index 88d7a53d8b38d..420c0299d2341 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; CHECK: for function 'interp_p1_f16'
 ; CHECK: DIVERGENT:       %p1 = call float @llvm.amdgcn.interp.p1.f16

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
index 564742137256d..13709e061c32c 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
 define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/branch-outside.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/branch-outside.ll
new file mode 100644
index 0000000000000..0227f7c8d762d
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/branch-outside.ll
@@ -0,0 +1,81 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+; CHECK-LABEL: UniformityInfo for function 'basic':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:   depth=1: entries(P T) Q
+define amdgpu_kernel void @basic(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br i1 %cond.div, label %T, label %P
+
+P:
+; CHECK:   DIVERGENT:   %pp.phi =
+; CHECK: DIVERGENT:      %pp =
+  %pp.phi  = phi i32 [ %a, %entry], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+; CHECK: DIVERGENT:   %qq =
+; CHECK:   DIVERGENT:   %qq.div =
+  %qq = add i32 %b, 1
+  %qq.div = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %T, label %exit
+
+T:
+; CHECK:   DIVERGENT:   %t.phi =
+; CHECK: DIVERGENT:     %tt =
+  %t.phi = phi i32 [ %qq, %Q ], [ %a, %entry ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+; CHECK-NOT: DIVERGENT:     %ee =
+  %ee = add i32 %b, 1
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'nested':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:  depth=1: entries(P T) Q A C B
+define amdgpu_kernel void @nested(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br i1 %cond.div, label %T, label %P
+
+P:
+ %pp.phi  = phi i32 [ %a, %entry], [ %b, %T ]
+ %pp = add i32 %b, 1
+ br i1 %cond.uni, label %B, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.uni, label %T, label %exit
+
+A:
+  %aa = add i32 %b, 1
+  br label %B
+
+B:
+  %bb = add i32 %b, 1
+  br label %C
+
+C:
+  %cc = add i32 %b, 1
+  br i1 %cond.uni, label %Q, label %A
+
+T:
+  %t.phi = phi i32 [ %qq, %Q ], [ %a, %entry ]
+  %tt = add i32 %b, 1
+  br i1 %cond.uni, label %A, label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll
new file mode 100644
index 0000000000000..fb83b19b95d86
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll
@@ -0,0 +1,80 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @divergent_cycle_1(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: UniformityInfo for function 'divergent_cycle_1':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:   depth=1: entries(R P) S Q
+; CHECK: CYCLES WITH DIVERGENT EXIT:
+; CHECK:   depth=2: entries(S P) Q
+; CHECK:   depth=1: entries(R P) S Q
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br i1 %cond.uni, label %P, label %R
+
+P:
+; CHECK:   DIVERGENT:   %pp.phi =
+  %pp.phi  = phi i32 [ %a, %entry], [ %b, %S ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+; CHECK:   DIVERGENT:   %s.phi =
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %b, 1
+  br i1 %cond.uni, label %exit, label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+define amdgpu_kernel void @uniform_cycle_1(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: UniformityInfo for function 'uniform_cycle_1':
+; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br i1 %cond.uni, label %P, label %T
+
+P:
+; CHECK-NOT:   DIVERGENT:   %pp.phi = phi i32
+  %pp.phi  = phi i32 [ %a, %entry], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+; CHECK:   DIVERGENT:   %s.phi =
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %b, 1
+  br i1 %cond.uni, label %exit, label %T
+
+T:
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
new file mode 100644
index 0000000000000..46e676b52c0ba
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -0,0 +1,240 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+; These tests have identical control flow graphs with slight changes
+; that affect cycle-info. There is a minor functional difference in
+; the branch conditions; but that is not relevant to the tests.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; The inner cycle has a header (P) that dominates the join, hence
+;; both cycles are reported as converged.
+;;
+;; CHECK-LABEL: UniformityInfo for function 'headers_b_p':
+;; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
+
+define amdgpu_kernel void @headers_b_p(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ %a.div = add i32 %tid, %a
+ br i1 %cond.uni, label %B, label %A
+
+A:
+ br label %B
+
+B:
+ br i1 %cond.uni, label %C, label %D
+
+C:
+ br i1 %cond.uni, label %T, label %P
+
+P:
+  %pp.phi  = phi i32 [ %a, %C], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br i1 %cond.uni, label %R, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %D, label %T
+
+D:
+  br i1 %cond.uni, label %exit, label %A
+
+T:
+  %tt.phi = phi i32 [ %ss, %S ], [ %a, %C ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Same as previous, but the outer cycle has a different header (A).
+;; The inner cycle has a header (P) that dominates the join, hence
+;; both cycles are reported as converged.
+;;
+;; CHECK-LABEL: UniformityInfo for function 'headers_a_p':
+;; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
+
+define amdgpu_kernel void @headers_a_p(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ %a.div = add i32 %tid, %a
+ br i1 %cond.uni, label %B, label %A
+
+A:
+ br label %B
+
+B:
+ br i1 %cond.uni, label %C, label %D
+
+C:
+ br i1 %cond.uni, label %T, label %P
+
+P:
+  %pp.phi  = phi i32 [ %a, %C], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br i1 %cond.uni, label %R, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %D, label %T
+
+D:
+  br i1 %cond.uni, label %exit, label %A
+
+T:
+  %tt.phi = phi i32 [ %ss, %S ], [ %a, %C ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; The inner cycle has a header (T) that does not dominate the join.
+;; The outer cycle has a header (B) that dominates the join. Hence
+;; only the inner cycle is reported as diverged.
+;;
+;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
+;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK:   depth=2: entries(T P) S Q R
+;; CHECK: CYCLES WITH DIVERGENT EXIT:
+;; CHECK:   depth=1: entries(B A) D T S Q P R C
+
+define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ %a.div = add i32 %tid, %a
+ br i1 %cond.uni, label %A, label %B
+
+A:
+ br label %B
+
+B:
+ br i1 %cond.uni, label %C, label %D
+
+C:
+ br i1 %cond.uni, label %P, label %T
+
+P:
+  %pp.phi  = phi i32 [ %a, %C], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br i1 %cond.uni, label %R, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %D, label %T
+
+D:
+  br i1 %cond.uni, label %exit, label %A
+
+T:
+  %tt.phi = phi i32 [ %ss, %S ], [ %a, %C ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; The cycles have headers (A and T) that do not dominate the join.
+;; Hence the outermost cycle is reported as diverged.
+;;
+;; CHECK-LABEL: UniformityInfo for function 'headers_a_t':
+;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK:   depth=1: entries(A B) D T S Q P R C
+;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
+
+define amdgpu_kernel void @headers_a_t(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ %a.div = add i32 %tid, %a
+ br i1 %cond.uni, label %B, label %A
+
+A:
+ br label %B
+
+B:
+ br i1 %cond.uni, label %C, label %D
+
+C:
+ br i1 %cond.uni, label %P, label %T
+
+P:
+  %pp.phi  = phi i32 [ %a, %C], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br i1 %cond.uni, label %R, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %D, label %T
+
+D:
+  br i1 %cond.uni, label %exit, label %A
+
+T:
+  %tt.phi = phi i32 [ %ss, %S ], [ %a, %C ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
new file mode 100644
index 0000000000000..335026dc9b62b
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
@@ -0,0 +1,103 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+; These tests have identical control flow graphs with slight changes
+; that affect cycle-info. There is a minor functional 
diff erence in
+; the branch conditions; but that is not relevant to the tests.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; The cycle has a header (T) that does not dominate the join, hence
+;; the entire cycle is reported as diverged.
+;;
+;; CHECK-LABEL: UniformityInfo for function 't_header':
+;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK:   depth=1: entries(T P) S Q R
+
+define amdgpu_kernel void @t_header(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ %a.div = add i32 %tid, %a
+ br i1 %cond.uni, label %P, label %T
+
+P:
+; CHECK:   DIVERGENT:   %pp.phi =
+  %pp.phi  = phi i32 [ %a, %entry], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br i1 %cond.uni, label %R, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+; CHECK:   DIVERGENT:   %s.phi =
+; CHECK:   DIVERGENT:   %ss =
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %exit, label %T
+
+T:
+; CHECK:   DIVERGENT:   %tt.phi =
+  %tt.phi = phi i32 [ %ss, %S ], [ %a, %entry ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; The cycle has a header (P) that dominates the join, hence
+;; the cycle is reported as converged.
+;;
+;; CHECK-LABEL: UniformityInfo for function 'p_header':
+;; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+
+define amdgpu_kernel void @p_header(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br i1 %cond.uni, label %T, label %P
+
+P:
+; CHECK-NOT:   DIVERGENT:   %pp.phi = phi i32
+  %pp.phi  = phi i32 [ %a, %entry], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br i1 %cond.uni, label %R, label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  br i1 %cond.div, label %S, label %R
+
+R:
+  %rr = add i32 %b, 1
+  br label %S
+
+S:
+; CHECK:   DIVERGENT:   %s.phi =
+; CHECK-NOT:   DIVERGENT:   %ss = add i32
+  %s.phi = phi i32 [ %qq, %Q ], [ %rr, %R ]
+  %ss = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %exit, label %T
+
+T:
+; CHECK-NOT:   DIVERGENT:   %tt.phi = phi i32
+  %tt.phi = phi i32 [ %ss, %S ], [ %a, %entry ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+  %ee = add i32 %b, 1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/exit-divergence.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/exit-divergence.ll
new file mode 100644
index 0000000000000..2a3ff4166213d
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/exit-divergence.ll
@@ -0,0 +1,139 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+; CHECK-LABEL: UniformityInfo for function 'basic':
+; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES WITH DIVERGENT EXIT:
+; CHECK:   depth=1: entries(P T) Q
+define amdgpu_kernel void @basic(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br i1 %cond.uni, label %T, label %P
+
+P:
+  %pp.phi.1  = phi i32 [ %a, %entry], [ %b, %T ]
+  %pp.phi.2  = phi i32 [ %a, %entry], [ %tt.phi, %T ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  %qq.div.1 = add i32 %pp.phi.2, 1
+  %qq.div.2 = add i32 %pp.phi.2, 1
+  br i1 %cond.div, label %T, label %exit
+
+T:
+  %tt.phi = phi i32 [ %qq, %Q ], [ %a, %entry ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+; CHECK:   DIVERGENT:   %ee.1 =
+; CHECK:   DIVERGENT:   %xx.2 =
+; CHECK-NOT: DIVERGENT:     %ee.3 =
+  %ee.1 = add i32 %pp.phi.1, 1
+  %xx.2 = add i32 %pp.phi.2, 1
+  %ee.3 = add i32 %b, 1
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'outer_reducible':
+; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES WITH DIVERGENT EXIT:
+; CHECK:   depth=1: entries(H) P T R Q
+define amdgpu_kernel void @outer_reducible(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cond.uni = icmp slt i32 %a, 0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cond.div = icmp slt i32 %tid, 0
+ br label %H
+
+H:
+ br i1 %cond.uni, label %T, label %P
+
+P:
+  %pp.phi.1  = phi i32 [ %a, %H], [ %b, %T ]
+  %pp.phi.2  = phi i32 [ %a, %H], [ %tt.phi, %T ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+  %qq = add i32 %b, 1
+  %qq.div.1 = add i32 %pp.phi.2, 1
+  %qq.div.2 = add i32 %pp.phi.2, 1
+  br i1 %cond.div, label %R, label %exit
+
+R:
+  br i1 %cond.uni, label %T, label %H
+
+
+T:
+  %tt.phi = phi i32 [ %qq, %R ], [ %a, %H ]
+  %tt = add i32 %b, 1
+  br label %P
+
+exit:
+; CHECK:   DIVERGENT:   %ee.1 =
+; CHECK:   DIVERGENT:   %xx.2 =
+; CHECK-NOT: DIVERGENT:     %ee.3 =
+  %ee.1 = add i32 %pp.phi.1, 1
+  %xx.2 = add i32 %pp.phi.2, 1
+  %ee.3 = add i32 %b, 1
+  ret void
+}
+
+;      entry(div)
+;      |   \
+;      H -> B
+;      ^   /|
+;      \--C |
+;          \|
+;           X
+;
+; This has a divergent cycle due to the external divergent branch, but
+; there are no divergent exits. Hence a use at X is not divergent
+; unless the def itself is divergent.
+;
+; CHECK-LABEL: UniformityInfo for function 'no_divergent_exit':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:   depth=1: entries(H B) C
+; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
+define amdgpu_kernel void @no_divergent_exit(i32 %n, i32 %a, i32 %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %div.cond, label %B, label %H
+
+H:                                                ; preds = %C, %entry
+; CHECK: DIVERGENT:  %div.merge.h =
+  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %C ]
+  br label %B
+
+B:                                                ; preds = %H, %entry
+; CHECK: DIVERGENT:  %div.merge.b =
+  %div.merge.b = phi i32 [ %a, %H ], [ 1, %entry ]
+; CHECK-NOT: DIVERGENT  %bb =
+  %bb = add i32 %a, 1
+; CHECK-NOT: DIVERGENT:  br i1 %uni.cond, label %X, label %C
+  br i1 %uni.cond, label %X, label %C
+
+C:                                                ; preds = %B
+; CHECK-NOT: DIVERGENT  %cc =
+  %cc = add i32 %a, 1
+; CHECK-NOT: DIVERGENT:  br i1 %uni.cond, label %X, label %H
+  br i1 %uni.cond, label %X, label %H
+
+; CHECK-LABEL: BLOCK X
+X:                                                ; preds = %C, %B
+; CHECK: DIVERGENT:  %uni.merge.x =
+  %uni.merge.x = phi i32 [ %bb, %B ], [%cc, %C ]
+; CHECK: DIVERGENT: %div.merge.x =
+  %div.merge.x = phi i32 [ %div.merge.b, %B ], [%cc, %C ]
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-1.ll
similarity index 72%
rename from llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll
rename to llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-1.ll
index e588f6ca95d62..0f4916a87f0aa 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-1.ll
@@ -1,13 +1,5 @@
 ; RUN: opt %s -mtriple amdgcn-- -passes='print<divergence>' -disable-output 2>&1 | FileCheck %s
-
-; NOTE: The new pass manager does not fall back on legacy divergence
-; analysis even when the function contains an irreducible loop. The
-; (new) divergence analysis conservatively reports all values as
-; divergent. This test does not check for this conservative
-; behaviour. Instead, it only checks for the values that are known to
-; be divergent according to the legacy analysis.
-
-; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 ; This test contains an unstructured loop.
 ;           +-------------- entry ----------------+
@@ -21,21 +13,27 @@
 ;                             |
 ;                             V
 ;                        if (i3 == 5) // divergent
-; because sync dependent on (tid / i3).
+;                                        because sync dependent on (tid / i3).
+
 define i32 @unstructured_loop(i1 %entry_cond) {
-; CHECK-LABEL: Divergence Analysis' for function 'unstructured_loop'
+; CHECK-LABEL: for function 'unstructured_loop'
+; CHECK: DIVERGENT: i1 %entry_cond
+
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
 loop_entry_1:
+; CHECK: DIVERGENT: %i1 =
   %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
   %j1 = add i32 %i1, 1
   br label %loop_body
 loop_entry_2:
+; CHECK: DIVERGENT: %i2 =
   %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
   %j2 = add i32 %i2, 2
   br label %loop_body
 loop_body:
+; CHECK: DIVERGENT: %i3 =
   %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
   br label %loop_latch
 loop_latch:
@@ -43,9 +41,10 @@ loop_latch:
   switch i32 %div, label %branch [ i32 1, label %loop_entry_1
                                    i32 2, label %loop_entry_2 ]
 branch:
+; CHECK: DIVERGENT: %cmp =
+; CHECK: DIVERGENT: br i1 %cmp,
   %cmp = icmp eq i32 %i3, 5
   br i1 %cmp, label %then, label %else
-; CHECK: DIVERGENT: br i1 %cmp,
 then:
   ret i32 0
 else:

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
new file mode 100644
index 0000000000000..e1e87dec7e0db
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
@@ -0,0 +1,574 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @cycle_diverge_enter(i32 %n, i32 %a, i32 %b) #0 {
+;      entry(div)
+;      /   \
+;     H <-> B
+;           |
+;           X
+; CHECK-LABEL: for function 'cycle_diverge_enter':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %div.cond, label %B, label %H  ; divergent branch
+
+H:
+  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %B ]
+  br label %B
+; CHECK: DIVERGENT: %div.merge.h
+
+B:
+  %div.merge.b = phi i32 [ %a, %H ], [1, %entry ]
+  %div.cond.b = icmp sgt i32 %div.merge.b, 0
+  %div.b.inc = add i32 %b, 1
+  br i1 %div.cond, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.merge.b
+
+X:
+  %div.use = add i32 %div.merge.b, 1
+  ret void
+; CHECK: DIVERGENT: %div.use =
+
+}
+
+define amdgpu_kernel void @cycle_diverge_exit(i32 %n, i32 %a, i32 %b) #0 {
+;      entry
+;      /   \
+;     H <-> B(div)
+;           |
+;           X
+;
+; CHECK-LABEL: for function 'cycle_diverge_exit':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %B, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %b, %B ]
+  br label %B
+
+B:
+  %uni.merge.b = phi i32 [ %a, %H ], [1, %entry ]
+  %uni.cond.b = icmp sgt i32 %uni.merge.b, 0
+  %uni.b.inc = add i32 %b, 1
+  br i1 %div.cond, label %X, label %H ; divergent branch
+
+X:
+  %div.use = add i32 %uni.merge.b, 1
+  ret void
+; CHECK: DIVERGENT: %div.use =
+}
+
+define amdgpu_kernel void @cycle_reentrance(i32 %n, i32 %a, i32 %b) #0 {
+; For this case, threads entering the cycle from C take C->D->H. At H,
+; diverged threads may keep looping in the cycle (H-B-D) until all
+; threads exit that cycle, causing temporal divergence on the exiting
+; edge H->C. We currently do not analyze this kind of inner-cycle
+; temporal divergence. Instead, we conservatively mark all values in
+; the cycle as divergent.
+;      entry--\
+;       |     |
+;  ---> H(div)|
+;  |   / \    /
+;  |  B   C<--
+;  ^   \ /
+;  \----D
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_reentrance':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %H, label %C
+
+H:
+  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
+  br i1 %div.cond, label %B, label %C  ; divergent branch
+
+B:
+  %div.inc.b = add i32 %div.merge.h, 1
+; CHECK: DIVERGENT: %div.inc.b
+  br label %D
+
+C:
+  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
+  %div.inc.c = add i32 %div.merge.c, 2
+; CHECK: DIVERGENT: %div.inc.c
+  br label %D
+
+D:
+  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
+; CHECK: DIVERGENT: %div.merge.d
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_reentrance2(i32 %n, i32 %a, i32 %b) #0 {
+; This is mostly the same as cycle_reentrance; the only difference is
+; the successor order, and thus a different dfs visiting order. This is
+; just to make sure we do the uniformity analysis correctly under a
+; different dfs order.
+;      entry--\
+;       |     |
+;  ---> H(div)|
+;  |   / \    /
+;  |  B   C<--
+;  ^   \ /
+;  \----D
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_reentrance2':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %C, label %H
+
+H:
+  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
+  br i1 %div.cond, label %B, label %C  ; divergent branch
+
+B:
+  %div.inc.b = add i32 %div.merge.h, 1
+; CHECK: DIVERGENT: %div.inc.b
+  br label %D
+
+C:
+  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
+  %div.inc.c = add i32 %div.merge.c, 2
+; CHECK: DIVERGENT: %div.inc.c
+  br label %D
+
+D:
+  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
+; CHECK: DIVERGENT: %div.merge.d
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_join_dominated_by_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; The join node D is dominated by the divergence point H2.
+;      entry
+;       | |
+;  --> H1 |
+;  |     \|
+;  |      H2(div)
+;  |     / \
+;  |    B   C
+;  ^     \ /
+;  \------D
+;         |
+;         X
+; CHECK-LABEL: for function 'cycle_join_dominated_by_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %H1, label %H2
+
+H1:
+  %uni.merge.h1 = phi i32 [ 0, %entry ], [ %b, %D ]
+  br label %H2
+
+H2:
+  %uni.merge.h2 = phi i32 [ 0, %entry ], [ %b, %H1 ]
+  br i1 %div.cond, label %B, label %C  ; divergent branch
+
+B:
+  %uni.inc.b = add i32 %uni.merge.h2, 1
+  br label %D
+
+C:
+  %uni.inc.c = add i32 %uni.merge.h2, 2
+  br label %D
+
+D:
+  %div.merge.d = phi i32 [ %uni.inc.b, %B ], [ %uni.inc.c, %C ]
+; CHECK: DIVERGENT: %div.merge.d
+  br i1 %uni.cond, label %X, label %H1
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_join_dominated_by_entry(i32 %n, i32 %a, i32 %b) #0 {
+; The join node D is dominated by the cycle entry H2.
+;      entry
+;       | |
+;  --> H1 |
+;  |     \|
+;  |      H2 -----
+;  |      |      |
+;  |      A(div) |
+;  |     / \     v
+;  |    B   C   /
+;  ^     \ /   /
+;  \------D <-/
+;         |
+;         X
+; CHECK-LABEL: for function 'cycle_join_dominated_by_entry':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %H1, label %H2
+
+H1:
+  %uni.merge.h1 = phi i32 [ 0, %entry ], [ %b, %D ]
+  br label %H2
+
+H2:
+  %uni.merge.h2 = phi i32 [ 0, %entry ], [ %b, %H1 ]
+  br i1 %uni.cond, label %A, label %D
+
+A:
+  br i1 %div.cond, label %B, label %C  ; divergent branch
+
+B:
+  %uni.inc.b = add i32 %uni.merge.h2, 1
+  br label %D
+
+C:
+  %uni.inc.c = add i32 %uni.merge.h2, 2
+  br label %D
+
+D:
+  %div.merge.d = phi i32 [ %uni.inc.b, %B ], [ %uni.inc.c, %C ], [%uni.merge.h2, %H2]
+; CHECK: DIVERGENT: %div.merge.d
+  br i1 %uni.cond, label %X, label %H1
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_join_not_dominated(i32 %n, i32 %a, i32 %b) #0 {
+; If H is the header, the sync label propagation may stop at the join node D.
+; But D is neither dominated by the block C where the divergence starts nor
+; by any of the cycle entries (H/C). So we conservatively mark all values in
+; the cycle as divergent for now.
+;      entry
+;       |  |
+;  ---> H  |
+;  |   / \ v
+;  |  B<--C(div)
+;  ^   \ /
+;  \----D
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_join_not_dominated':
+; CHECK-NOT: DIVERGENT: %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %C, label %H
+
+H:
+  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
+  br i1 %uni.cond, label %B, label %C
+
+B:
+  %div.merge.b = phi i32 [ 0, %H ], [ %b, %C ]
+  %div.inc.b = add i32 %div.merge.b, 1
+; CHECK: DIVERGENT: %div.inc.b
+  br label %D
+
+C:
+  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
+  %div.inc.c = add i32 %div.merge.c, 2
+; CHECK: DIVERGENT: %div.inc.c
+  br i1 %div.cond, label %D, label %B  ; divergent branch
+
+D:
+  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
+; CHECK: DIVERGENT: %div.merge.d
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_join_not_dominated2(i32 %n, i32 %a, i32 %b) #0 {
+; This is mostly the same as cycle_join_not_dominated, the only difference is
+; the dfs visiting order, so the cycle analysis result is different.
+;      entry
+;       |  |
+;  ---> H  |
+;  |   / \ v
+;  |  B<--C(div)
+;  ^   \ /
+;  \----D
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_join_not_dominated2':
+; CHECK-NOT: DIVERGENT: %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %H, label %C
+
+H:
+  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
+  br i1 %uni.cond, label %B, label %C
+
+B:
+  %div.merge.b = phi i32 [ 0, %H ], [ %b, %C ]
+  %div.inc.b = add i32 %div.merge.b, 1
+; CHECK: DIVERGENT: %div.inc.b
+  br label %D
+
+C:
+  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
+  %div.inc.c = add i32 %div.merge.c, 2
+; CHECK: DIVERGENT: %div.inc.c
+  br i1 %div.cond, label %D, label %B  ; divergent branch
+
+D:
+  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
+; CHECK: DIVERGENT: %div.merge.d
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @natural_loop_two_backedges(i32 %n, i32 %a, i32 %b) #0 {
+; FIXME: the uni.merge.h can be viewed as uniform.
+; CHECK-LABEL: for function 'natural_loop_two_backedges':
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ], [ %uni.inc, %C]
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %B
+
+B:
+  br i1 %div.cond, label %C, label %H
+
+C:
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @natural_loop_two_backedges2(i32 %n, i32 %a, i32 %b) #0 {
+; FIXME: the uni.merge.h can be viewed as uniform.
+; CHECK-LABEL: for function 'natural_loop_two_backedges2':
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ], [ %uni.inc, %C]
+  %uni.inc = add i32 %uni.merge.h, 1
+  br i1 %uni.cond, label %B, label %D
+
+B:
+  br i1 %div.cond, label %C, label %H
+
+C:
+  br label %H
+
+D:
+  br i1 %uni.cond, label %B, label %X
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_enter_nested(i32 %n, i32 %a, i32 %b) #0 {
+;
+;   entry(div)
+;       |   \
+;   --> H1   |
+;  /    |    |
+;  | -> H2   |
+;  | |  |    /
+;  | \--B <--
+;  ^    |
+;  \----C
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_enter_nested':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %div.cond, label %B, label %H1
+
+H1:
+  %div.merge.h1 = phi i32 [ 1, %entry ], [ %b, %C ]
+  br label %H2
+; CHECK: DIVERGENT: %div.merge.h1
+
+H2:
+  %div.merge.h2 = phi i32 [ 2, %B ], [ %a, %H1 ]
+; CHECK: DIVERGENT: %div.merge.h2
+  br label %B
+
+B:
+  %div.merge.b = phi i32 [0, %entry], [%a, %H2]
+; CHECK: DIVERGENT: %div.merge.b
+  br i1 %uni.cond, label %C, label %H2
+
+C:
+  br i1 %uni.cond, label %X, label %H1
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_inner_exit_enter(i32 %n, i32 %a, i32 %b) #0 {
+;          entry
+;        /      \
+;       E1-> A-> E2
+;       ^    |    \
+;       |    E3-> E4
+;       |    ^   /
+;       |     \ /
+;       C <----B
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_inner_exit_enter':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %E2, label %E1
+
+E1:
+  %div.merge.e1 = phi i32 [ 1, %entry ], [ %b, %C ]
+  br label %A
+; CHECK: DIVERGENT: %div.merge.e1
+
+A:
+  br i1 %uni.cond, label %E2, label %E3
+
+E2:
+  %div.merge.e2 = phi i32 [ 2, %entry ], [ %a, %A ]
+; CHECK: DIVERGENT: %div.merge.e2
+  br label %E4
+
+E3:
+  %div.merge.e3 = phi i32 [ 0, %A ], [ %b, %B ]
+; CHECK: DIVERGENT: %div.merge.e3
+  br label %E4
+
+E4:
+  %div.merge.e4 = phi i32 [ 0, %E2 ], [ %a, %E3 ]
+; CHECK: DIVERGENT: %div.merge.e4
+  br label %B
+
+B:
+  br i1 %div.cond, label %C, label %E3
+
+C:
+  br i1 %uni.cond, label %X, label %E1
+
+X:
+  ret void
+}
+
+define amdgpu_kernel void @cycle_inner_exit_enter2(i32 %n, i32 %a, i32 %b) #0 {
+; This case is almost the same as cycle_inner_exit_enter, with only a
+; different dfs visiting order and thus a different cycle hierarchy.
+;          entry
+;        /      \
+;       E1-> A-> E2
+;       ^    |    \
+;       |    E3-> E4
+;       |    ^   /
+;       |     \ /
+;       C <----B
+;       |
+;       X
+; CHECK-LABEL: for function 'cycle_inner_exit_enter2':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %E1, label %E2
+
+E1:
+  %div.merge.e1 = phi i32 [ 1, %entry ], [ %b, %C ]
+  br label %A
+; CHECK: DIVERGENT: %div.merge.e1
+
+A:
+  br i1 %uni.cond, label %E2, label %E3
+
+E2:
+  %div.merge.e2 = phi i32 [ 2, %entry ], [ %a, %A ]
+; CHECK: DIVERGENT: %div.merge.e2
+  br label %E4
+
+E3:
+  %div.merge.e3 = phi i32 [ 0, %A ], [ %b, %B ]
+; CHECK: DIVERGENT: %div.merge.e3
+  br label %E4
+
+E4:
+  %div.merge.e4 = phi i32 [ 0, %E2 ], [ %a, %E3 ]
+; CHECK: DIVERGENT: %div.merge.e4
+  br label %B
+
+B:
+  br i1 %div.cond, label %C, label %E3
+
+C:
+  br i1 %uni.cond, label %X, label %E1
+
+X:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/reducible-headers.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/reducible-headers.ll
new file mode 100644
index 0000000000000..c64cb315e82fd
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/reducible-headers.ll
@@ -0,0 +1,225 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+;
+;                         Entry
+;                           |
+;                           v
+;                  -------->H---------
+;                  |        |        |
+;                  |        v        |
+;                  |    --->T----    |
+;                  |    |       |    |
+;                  |    |       V    |
+;                  S<---R       P <---
+;                  ^    ^       |
+;                  |    |  Div  |
+;                  |    --- Q <--
+;                  |        |
+;                  |        v
+;                  -------- U
+;                           |
+;                           v
+;                          Exit
+;
+; The divergent branch is at Q, which exits an irreducible cycle with
+; entries T and P nested inside a reducible cycle with header H. R is
+; assigned label R, which reaches P. S is a join node with label S. If
+; this label is propagated to P via H, then P is incorrectly recognized as a
+; join, making the inner cycle divergent. P is always executed
+; convergently -- either by threads that reconverged at header H, or
+; by threads that are still executing the inner cycle. Thus, any PHI
+; at P should not be marked divergent.
+
+define amdgpu_kernel void @nested_irreducible(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: UniformityInfo for function 'nested_irreducible':
+; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES WITH DIVERGENT EXIT:
+; CHECK:   depth=2: entries(P T) R Q
+; CHECK:   depth=1: entries(H) S P T R Q U
+entry:
+  %cond.uni = icmp slt i32 %a, 0
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.div = icmp slt i32 %tid, 0
+  br label %H
+
+H:
+ br i1 %cond.uni, label %T, label %P
+
+P:
+; CHECK-LABEL: BLOCK P
+; CHECK-NOT:   DIVERGENT:   %pp.phi =
+; CHECK-NOT: DIVERGENT:      %pp =
+  %pp.phi  = phi i32 [ %a, %H], [ %b, %T ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+; CHECK-LABEL: BLOCK Q
+; CHECK-NOT: DIVERGENT:   %qq =
+; CHECK-NOT:   DIVERGENT:   %qq.uni =
+  %qq = add i32 %b, 1
+  %qq.uni = add i32 %pp.phi, 1
+  br i1 %cond.div, label %R, label %U
+
+R:
+  br i1 %cond.uni, label %S, label %T
+
+T:
+; CHECK-LABEL: BLOCK T
+; CHECK-NOT:   DIVERGENT:   %tt.phi =
+; CHECK-NOT: DIVERGENT:     %tt =
+  %tt.phi = phi i32 [ %qq, %R ], [ %a, %H ]
+  %tt = add i32 %b, 1
+  br label %P
+
+S:
+; CHECK-LABEL: BLOCK S
+; CHECK:   DIVERGENT:   %ss.phi =
+; CHECK-NOT: DIVERGENT:     %ss =
+  %ss.phi = phi i32 [ %qq.uni, %U ], [ %a, %R ]
+  %ss = add i32 %b, 1
+  br label %H
+
+U:
+  br i1 %cond.uni, label %S, label %exit
+
+exit:
+; CHECK: DIVERGENT:     %ee.div =
+; CHECK-NOT: DIVERGENT:     %ee =
+  %ee.div =  add i32 %qq.uni, 1
+  %ee = add i32 %b, 1
+  ret void
+}
+
+;
+;                         Entry
+;                           |
+;                           v
+;               -->-------->H---------
+;               |  ^        |        |
+;               |  |        |        |
+;               |  |        |        |
+;               |  |        |        |
+;               |  |        v        V
+;               |  R<-------T-->U--->P
+;               |          Div       |
+;               |                    |
+;               ----------- Q <-------
+;                           |
+;                           v
+;                          Exit
+;
+; This is a reducible cycle with a divergent branch at T. Disjoint
+; paths eventually join at the header H, which is assigned label H.
+; Node P is assigned label U. If the header label were propagated to
+; P, it would be incorrectly recognized as a join. P is always executed
+; convergently -- either by threads that reconverged at header H, or
+; by threads that diverged at T (and eventually reconverged at H).
+; Thus, any PHI at P should not be marked divergent.
+
+define amdgpu_kernel void @header_label_1(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: UniformityInfo for function 'header_label_1':
+; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES WITH DIVERGENT EXIT:
+; CHECK:  depth=1: entries(H) Q P U T R
+entry:
+  %cond.uni = icmp slt i32 %a, 0
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.div = icmp slt i32 %tid, 0
+  br label %H
+
+H:
+  br i1 %cond.uni, label %T, label %P
+
+P:
+; CHECK-LABEL: BLOCK P
+; CHECK-NOT:   DIVERGENT:   %pp.phi =
+; CHECK-NOT: DIVERGENT:      %pp =
+  %pp.phi  = phi i32 [ %a, %H], [ %b, %U ]
+  %pp = add i32 %b, 1
+  br label %Q
+
+Q:
+; CHECK-LABEL: BLOCK Q
+; CHECK-NOT: DIVERGENT:   %qq =
+; CHECK-NOT:   DIVERGENT:   %qq.uni =
+  %qq = add i32 %b, 1
+  %qq.uni = add i32 %pp.phi, 1
+  br i1 %cond.uni, label %exit, label %H
+
+R:
+  br label %H
+
+T:
+  br i1 %cond.div, label %R, label %U
+
+U:
+  br label %P
+
+exit:
+; CHECK-LABEL: BLOCK exit
+; CHECK: DIVERGENT:     %ee.div =
+; CHECK-NOT: DIVERGENT:     %ee =
+  %ee.div =  add i32 %qq.uni, 1
+  %ee = add i32 %b, 1
+  ret void
+}
+
+;        entry
+;            |
+;        --> H1
+;        |   | \
+;        |   | H2(div)
+;        |   \ / \
+;        |    B   C
+;        ^     \ /
+;        \------D
+;               |
+;               X
+;
+; This is a reducible cycle with a divergent branch at H2. Disjoint
+; paths eventually join at the node D, which is assigned label D.
+; Node B is assigned label B. If the label D were propagated to
+; B, it would be incorrectly recognized as a join. B is always executed
+; convergently -- either by threads that reconverged at header H1, or
+; by threads that diverged at H2 (and eventually reconverged at H1).
+; Thus, any PHI at B should not be marked divergent.
+
+define amdgpu_kernel void @header_label_2(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: UniformityInfo for function 'header_label_2':
+; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
+entry:
+  %cond.uni = icmp slt i32 %a, 0
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.div = icmp slt i32 %tid, 0
+  br label %H1
+
+H1:
+  br i1 %cond.uni, label %B, label %H2
+
+H2:
+  br i1 %cond.div, label %B, label %C
+
+B:
+; CHECK-LABEL: BLOCK B
+; CHECK-NOT: DIVERGENT:     %bb.phi =
+  %bb.phi = phi i32 [%a, %H1], [%b, %H2]
+  br label %D
+
+C:
+  br label %D
+
+D:
+; CHECK-LABEL: BLOCK D
+; CHECK: DIVERGENT:     %dd.phi =
+  %dd.phi = phi i32 [%a, %B], [%b, %C]
+  br i1 %cond.uni, label %exit, label %H1
+
+exit:
+  %ee.1 = add i32 %dd.phi, 1
+  %ee.2 = add i32 %b, 1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-exit.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-exit.ll
index ee37bdcf1f629..f7cba8bb30eb5 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-exit.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-exit.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
-; CHECK: bb3:
 ; CHECK: DIVERGENT:       %Guard.bb4 = phi i1 [ true, %bb1 ], [ false, %bb2 ]
 ; CHECK: DIVERGENT:       br i1 %Guard.bb4, label %bb4, label %bb5
 

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-heart.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-heart.ll
new file mode 100644
index 0000000000000..9e659602333aa
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/join-at-loop-heart.ll
@@ -0,0 +1,44 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: DIVERGENT:  %phi.h = phi i32 [ 0, %entry ], [ %inc, %C ], [ %inc, %D ], [ %inc, %E ]
+; CHECK: DIVERGENT:       %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: DIVERGENT:       %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT:  %inc = add i32 %phi.h, 1
+; CHECK: DIVERGENT:       br i1 %div.cond, label %C, label %D
+
+define void @nested_loop_extension() {
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %A
+
+A:
+  %phi.h = phi i32 [ 0, %entry ], [ %inc, %C ], [ %inc, %D ], [ %inc, %E ]
+  br label %B
+
+B:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %inc = add i32 %phi.h, 1
+  br i1 %div.cond, label %C, label %D
+
+C:
+  br i1 undef, label %A, label %E
+
+D:
+  br i1 undef, label %A, label %E
+
+E:
+  %b = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  br i1 undef, label %A, label %F
+
+F:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+declare token @llvm.experimental.convergence.anchor()
+declare token @llvm.experimental.convergence.loop()
+
+attributes #0 = { nounwind readnone }

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
index 1d02d5c9bb9ff..36e5cde1bd4b7 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
@@ -1,18 +1,19 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
-; CHECK-LABEL: Divergence Analysis' for function 'test_amdgpu_ps':
-; CHECK: DIVERGENT:  ptr addrspace(4) %arg0
+; CHECK-LABEL: for function 'test_amdgpu_ps':
+; CHECK-DAG: DIVERGENT:  ptr addrspace(4) %arg0
+; CHECK-DAG: DIVERGENT:  <2 x i32> %arg3
+; CHECK-DAG: DIVERGENT:  <3 x i32> %arg4
+; CHECK-DAG: DIVERGENT:  float %arg5
+; CHECK-DAG: DIVERGENT:  i32 %arg6
 ; CHECK-NOT: DIVERGENT
-; CHECK: DIVERGENT:  <2 x i32> %arg3
-; CHECK: DIVERGENT:  <3 x i32> %arg4
-; CHECK: DIVERGENT:  float %arg5
-; CHECK: DIVERGENT:  i32 %arg6
 
 define amdgpu_ps void @test_amdgpu_ps(ptr addrspace(4) byref([4 x <16 x i8>]) %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
   ret void
 }
 
-; CHECK-LABEL: Divergence Analysis' for function 'test_amdgpu_kernel':
+; CHECK-LABEL: for function 'test_amdgpu_kernel':
 ; CHECK-NOT: %arg0
 ; CHECK-NOT: %arg1
 ; CHECK-NOT: %arg2
@@ -24,7 +25,7 @@ define amdgpu_kernel void @test_amdgpu_kernel(ptr addrspace(4) byref([4 x <16 x
   ret void
 }
 
-; CHECK-LABEL: Divergence Analysis' for function 'test_c':
+; CHECK-LABEL: for function 'test_c':
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
index b865eddd6f13a..2e8d352e29786 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(
 define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
index 183f41f932d76..9cf2eec038940 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(
 define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
index 176b8080ef4d7..c8498c4ed52ad 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; CHECK: DIVERGENT:  %tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp2
 ; CHECK: DIVERGENT:  %tmp10 = load volatile float, ptr addrspace(1) %tmp5, align 4

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
index 03ddd46cfcc31..c7dade2afedc9 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
@@ -1,14 +1,18 @@
-; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LOOPDA
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CYCLEDA
 
 ; CHECK-LABEL: 'test1':
-; CHECK-NEXT: DIVERGENT: i32 %bound
-; CHECK: {{^  *}}%counter =
+; CHECK: DIVERGENT: i32 %bound
+; CYCLEDA: DIVERGENT: %counter =
+; LOOPDA: {{^  *}} %counter =
 ; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound
-; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body
-; CHECK: {{^  *}}%counter.next =
-; CHECK: {{^  *}}%counter.footer =
-; CHECK: DIVERGENT: br i1 %break, label %end, label %header
+; CYCLEDA: DIVERGENT: %counter.next =
+; CYCLEDA: DIVERGENT: %counter.footer =
+; LOOPDA: {{^  *}}%counter.next =
+; LOOPDA: {{^  *}}%counter.footer =
+
 ; Note: %counter is not divergent!
+
 define amdgpu_ps void @test1(i32 %bound) {
 entry:
   br label %header

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/propagate-loop-live-out.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/propagate-loop-live-out.ll
index 94662df054ae2..e2a07ae956488 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/propagate-loop-live-out.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/propagate-loop-live-out.ll
@@ -1,8 +1,7 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
-; CHECK: bb6:
 ; CHECK: DIVERGENT:       %.126.i355.i = phi i1 [ false, %bb5 ], [ true, %bb4 ]
-; CHECK: DIVERGENT:       br i1 %.126.i355.i, label %bb7, label %bb8
 
 ; Function Attrs: nounwind readnone speculatable
 declare i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
index c443d98cf743a..b6e7a32f71eaa 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
@@ -1,8 +1,9 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; temporal-divergent use of value carried by divergent loop
 define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'temporal_diverge':
+; CHECK-LABEL: for function 'temporal_diverge':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -26,7 +27,7 @@ X:
 
 ; temporal-divergent use of value carried by divergent loop inside a top-level loop
 define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'temporal_diverge_inloop':
+; CHECK-LABEL: for function 'temporal_diverge_inloop':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -58,7 +59,7 @@ Y:
 
 ; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop
 define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'temporal_uniform_indivloop':
+; CHECK-LABEL: for function 'temporal_uniform_indivloop':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -90,7 +91,7 @@ Y:
 
 ; temporal-divergent use of value carried by divergent loop, user is inside sibling loop
 define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'temporal_diverge_loopuser':
+; CHECK-LABEL: for function 'temporal_diverge_loopuser':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 
@@ -120,7 +121,7 @@ Y:
 
 ; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop
 define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: Divergence Analysis' for function 'temporal_diverge_loopuser_nested':
+; CHECK-LABEL: for function 'temporal_diverge_loopuser_nested':
 ; CHECK-NOT: DIVERGENT: %uni.
 ; CHECK-NOT: DIVERGENT: br i1 %uni.
 

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/trivial-join-at-loop-exit.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/trivial-join-at-loop-exit.ll
index b1f1c3a195b7c..9b32614ca624b 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/trivial-join-at-loop-exit.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/trivial-join-at-loop-exit.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
-; CHECK: bb2:
 ; CHECK-NOT: DIVERGENT:       %Guard.bb2 = phi i1 [ true, %bb1 ], [ false, %bb0 ]
 
 ; Function Attrs: nounwind readnone speculatable

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
index 0c4104a247d67..ef335c04c8b52 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 ; CHECK: DIVERGENT:  %tmp = cmpxchg volatile
 define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
index 0d47665d571cc..608e21bbdeab6 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<divergence>' -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
index d830ae0285dc6..f43e9ff94132d 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
@@ -1,14 +1,16 @@
 ; RUN: opt %s -passes='print<divergence>' -disable-output 2>&1 | FileCheck %s
+; RUN: opt %s -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 define i32 @daorder(i32 %n) {
-; CHECK-LABEL: Divergence Analysis' for function 'daorder'
+; CHECK-LABEL: for function 'daorder'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cond = icmp slt i32 %tid, 0
   br i1 %cond, label %A, label %B ; divergent
+; CHECK:  DIVERGENT: %cond =
 ; CHECK: DIVERGENT: br i1 %cond,
 A:
   %defAtA = add i32 %n, 1 ; uniform

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
index 2a5242b6323a2..a2ac88ca7e2b6 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
@@ -1,15 +1,17 @@
 ; RUN: opt %s -passes='print<divergence>' -disable-output 2>&1 | FileCheck %s
+; RUN: opt %s -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
 define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Divergence Analysis' for function 'no_diverge'
+; CHECK-LABEL: for function 'no_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cond = icmp slt i32 %n, 0
   br i1 %cond, label %then, label %else ; uniform
+; CHECK-NOT: DIVERGENT: %cond =
 ; CHECK-NOT: DIVERGENT: br i1 %cond,
 then:
   %a1 = add i32 %a, %tid
@@ -27,11 +29,12 @@ merge:
 ;   c = b;
 ; return c;               // c is divergent: sync dependent
 define i32 @sync(i32 %a, i32 %b) {
-; CHECK-LABEL: Divergence Analysis' for function 'sync'
+; CHECK-LABEL: for function 'sync'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
   %cond = icmp slt i32 %tid, 5
   br i1 %cond, label %bb2, label %bb3
+; CHECK:  DIVERGENT: %cond =
 ; CHECK: DIVERGENT: br i1 %cond,
 bb2:
   br label %bb3
@@ -48,11 +51,12 @@ bb3:
 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5
 ; return c;
 define i32 @mixed(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Divergence Analysis' for function 'mixed'
+; CHECK-LABEL: for function 'mixed'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
   %cond = icmp slt i32 %tid, 5
   br i1 %cond, label %bb6, label %bb2
+; CHECK:  DIVERGENT: %cond =
 ; CHECK: DIVERGENT: br i1 %cond,
 bb2:
   %cond2 = icmp slt i32 %n, 0
@@ -73,13 +77,14 @@ bb6:
 
 ; We conservatively treats all parameters of a __device__ function as divergent.
 define i32 @device(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Divergence Analysis' for function 'device'
-; CHECK: DIVERGENT: i32 %n
-; CHECK: DIVERGENT: i32 %a
-; CHECK: DIVERGENT: i32 %b
+; CHECK-LABEL: for function 'device'
+; CHECK-DAG: DIVERGENT: i32 %n
+; CHECK-DAG: DIVERGENT: i32 %a
+; CHECK-DAG: DIVERGENT: i32 %b
 entry:
   %cond = icmp slt i32 %n, 0
   br i1 %cond, label %then, label %else
+; CHECK:  DIVERGENT: %cond =
 ; CHECK: DIVERGENT: br i1 %cond,
 then:
   br label %merge
@@ -98,7 +103,7 @@ merge:
 ;
 ; The i defined in the loop is used outside.
 define i32 @loop() {
-; CHECK-LABEL: Divergence Analysis' for function 'loop'
+; CHECK-LABEL: for function 'loop'
 entry:
   %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
   br label %loop
@@ -111,6 +116,7 @@ loop:
 loop_exit:
   %cond = icmp eq i32 %i, 10
   br i1 %cond, label %then, label %else
+; CHECK:  DIVERGENT: %cond =
 ; CHECK: DIVERGENT: br i1 %cond,
 then:
   ret i32 0
@@ -120,7 +126,7 @@ else:
 
 ; Same as @loop, but the loop is in the LCSSA form.
 define i32 @lcssa() {
-; CHECK-LABEL: Divergence Analysis' for function 'lcssa'
+; CHECK-LABEL: for function 'lcssa'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   br label %loop
@@ -135,6 +141,7 @@ loop_exit:
 ; CHECK: DIVERGENT: %i.lcssa =
   %cond = icmp eq i32 %i.lcssa, 10
   br i1 %cond, label %then, label %else
+; CHECK:  DIVERGENT: %cond =
 ; CHECK: DIVERGENT: br i1 %cond,
 then:
   ret i32 0
@@ -144,6 +151,7 @@ else:
 
 ; Verifies sync-dependence is computed correctly in the absense of loops.
 define i32 @sync_no_loop(i32 %arg) {
+; CHECK-LABEL: for function 'sync_no_loop'
 entry:
   %0 = add i32 %arg, 1
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
index bb12f8b212809..54329e099d42c 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
@@ -1,18 +1,21 @@
 ; RUN: opt %s -passes='print<divergence>' -disable-output 2>&1 | FileCheck %s
+; RUN: opt %s -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Divergence Analysis' for function 'hidden_diverge'
+; CHECK-LABEL: for function 'hidden_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cond.var = icmp slt i32 %tid, 0
   br i1 %cond.var, label %B, label %C ; divergent
+; CHECK:  DIVERGENT: %cond.var =
 ; CHECK: DIVERGENT: br i1 %cond.var,
 B:
   %cond.uni = icmp slt i32 %n, 0
   br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: %cond.uni =
 ; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
 C:
   %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
index b81045dfb4a03..5cfecbd12f4ae 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -passes='print<divergence>' -disable-output 2>&1 | FileCheck %s
+; RUN: opt %s -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 ; NOTE: The new pass manager does not fall back on legacy divergence
 ; analysis even when the function contains an irreducible loop. The
@@ -24,7 +25,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;                        if (i3 == 5) // divergent
 ; because sync dependent on (tid / i3).
 define i32 @unstructured_loop(i1 %entry_cond) {
-; CHECK-LABEL: Divergence Analysis' for function 'unstructured_loop'
+; CHECK-LABEL: for function 'unstructured_loop'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
@@ -46,6 +47,7 @@ loop_latch:
 branch:
   %cmp = icmp eq i32 %i3, 5
   br i1 %cmp, label %then, label %else
+; CHECK:  DIVERGENT: %cmp =
 ; CHECK: DIVERGENT: br i1 %cmp,
 then:
   ret i32 0


        

