[llvm] [AMDGPU] Eliminate likely-spurious execz checks (PR #117567)

Fabian Ritter via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 3 01:09:34 PST 2025


https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/117567

From 3870adfa8ff6fc31e1efcfbfb120740ca17461ee Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 25 Nov 2024 09:44:23 -0500
Subject: [PATCH 1/4] [AMDGPU] Eliminate likely-spurious execz checks

Currently, we introduce branches to skip conditionally executed
instructions when the EXEC mask is zero, and we only eliminate them if
the scheduling model says that executing the skipped instructions is
cheaper than taking the branch.

This patch adds branch weights to branches whose condition is likely
(according to a heuristic) to vary dynamically across the threads of a
wavefront. This causes SIPreEmitPeephole to eliminate the corresponding
execz branch.

For now, this is implemented as a new middle-end pass with a rather
optimistic heuristic, to gather initial feedback.
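
For illustration, the effect of the pass is to attach branch_weights
profile metadata to the conditional branch guarding a likely-varying
block. A minimal sketch with made-up names and weight values (the actual
weights are derived from TTI.getPredictableBranchThreshold()):

  ; %cond is derived from the workitem id, so it is likely to vary
  ; within a wavefront; the pass marks the "true" successor as likely.
  br i1 %cond, label %do.work, label %merge, !prof !0
  ...
  ; With these weights, SIPreEmitPeephole eliminates the s_cbranch_execz
  ; that would otherwise be emitted to skip %do.work.
  !0 = !{!"branch_weights", i32 2000, i32 1}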
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  15 ++
 .../AMDGPUAnnotateVaryingBranchWeights.cpp    | 201 ++++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   3 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   4 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 .../conditional-mem-no-cbranch-execz.ll       |  66 ++++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   5 +
 ...ne-sink-temporal-divergence-swdev407790.ll | 176 ++++++++-------
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 +
 9 files changed, 380 insertions(+), 92 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..e7914d1de4a8fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -380,6 +380,10 @@ FunctionPass *createAMDGPURewriteUndefForPHILegacyPass();
 void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &);
 extern char &AMDGPURewriteUndefForPHILegacyPassID;
 
+FunctionPass *createAMDGPUAnnotateVaryingBranchWeightsLegacyPass();
+void initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(PassRegistry &);
+extern char &AMDGPUAnnotateVaryingBranchWeightsLegacyPassID;
+
 class AMDGPURewriteUndefForPHIPass
     : public PassInfoMixin<AMDGPURewriteUndefForPHIPass> {
 public:
@@ -397,6 +401,17 @@ class SIAnnotateControlFlowPass
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+class AMDGPUAnnotateVaryingBranchWeightsPass
+    : public PassInfoMixin<AMDGPUAnnotateVaryingBranchWeightsPass> {
+private:
+  const AMDGPUTargetMachine &TM;
+
+public:
+  AMDGPUAnnotateVaryingBranchWeightsPass(const AMDGPUTargetMachine &TM)
+      : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 void initializeSIAnnotateControlFlowLegacyPass(PassRegistry &);
 extern char &SIAnnotateControlFlowLegacyPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
new file mode 100644
index 00000000000000..3c637290cbdbe3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
@@ -0,0 +1,201 @@
+//===- AMDGPUAnnotateVaryingBranchWeights.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Estimate if conditional branches for which SIAnnotateControlFlow introduced
+// amdgcn_if or amdgcn_else intrinsics are likely to have different outcomes for
+// the threads of each wavefront. If that is the case, BranchWeight metadata is
+// added to signal that "then" and "else" blocks are both likely to be executed.
+// This may introduce branch weights that would be self-contradictory in a
+// non-SIMT setting.
+//
+// A consequence of this is that SIPreEmitPeephole is more likely to eliminate
+// s_cbranch_execz instructions that were introduced to skip these blocks when
+// no thread in the wavefront is active for them.
+//
+// Should only run after SIAnnotateControlFlow.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-annotate-varying-branch-weights"
+
+namespace {
+
+class AMDGPUAnnotateVaryingBranchWeightsImpl {
+public:
+  AMDGPUAnnotateVaryingBranchWeightsImpl() = delete;
+  AMDGPUAnnotateVaryingBranchWeightsImpl(const GCNSubtarget &ST,
+                                         const TargetTransformInfo &TTI)
+      : ST(ST), TTI(TTI) {
+    // Determine weights that signal that a branch is very likely to be
+    // predicted correctly, i.e., whose ratio exceeds
+    // TTI.getPredictableBranchThreshold().
+    auto BranchProbThreshold = TTI.getPredictableBranchThreshold();
+    LikelyWeight = BranchProbThreshold.getNumerator();
+    UnlikelyWeight = BranchProbThreshold.getDenominator() - LikelyWeight;
+    if (UnlikelyWeight > 0)
+      --UnlikelyWeight;
+  }
+
+  bool run(Function &F);
+
+private:
+  const GCNSubtarget &ST;
+  const TargetTransformInfo &TTI;
+  uint32_t LikelyWeight;
+  uint32_t UnlikelyWeight;
+  ValueMap<const Value *, bool> LikelyVaryingCache;
+
+  /// Heuristically check if it is likely that a wavefront has dynamically
+  /// varying values for V.
+  bool isLikelyVarying(const Value *V);
+
+  /// Set branch weights that signal that the "true" successor of Term is the
+  /// likely destination, if no prior weights are present.
+  /// Return true if weights were set.
+  bool setTrueSuccessorLikely(BranchInst *Term);
+};
+
+class AMDGPUAnnotateVaryingBranchWeightsLegacy : public FunctionPass {
+public:
+  static char ID;
+  AMDGPUAnnotateVaryingBranchWeightsLegacy() : FunctionPass(ID) {
+    initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Annotate Varying Branch Weights";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override {
+    TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+    const TargetMachine &TM = TPC.getTM<TargetMachine>();
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
+    return AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUAnnotateVaryingBranchWeightsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
+                      "Annotate Varying Branch Weights", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
+                    "Annotate Varying Branch Weights", false, false)
+
+FunctionPass *llvm::createAMDGPUAnnotateVaryingBranchWeightsLegacyPass() {
+  return new AMDGPUAnnotateVaryingBranchWeightsLegacy();
+}
+
+PreservedAnalyses
+AMDGPUAnnotateVaryingBranchWeightsPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
+  bool Changed = AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
+  // Check if V is a source of divergence or if it transitively uses one.
+  if (TTI.isSourceOfDivergence(V))
+    return true;
+
+  auto *U = dyn_cast<User>(V);
+  if (!U)
+    return false;
+
+  // Have we already checked V?
+  auto CacheEntry = LikelyVaryingCache.find(V);
+  if (CacheEntry != LikelyVaryingCache.end())
+    return CacheEntry->second;
+
+  // Does it use a likely varying Value?
+  bool Result = false;
+  for (const auto &Use : U->operands()) {
+    Result |= isLikelyVarying(Use);
+    if (Result)
+      break;
+  }
+
+  LikelyVaryingCache.insert({V, Result});
+  return Result;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::setTrueSuccessorLikely(
+    BranchInst *Term) {
+  assert(Term->isConditional());
+
+  // Don't overwrite existing branch weights.
+  if (hasProfMD(*Term))
+    return false;
+
+  llvm::setBranchWeights(*Term, {LikelyWeight, UnlikelyWeight}, false);
+  LLVM_DEBUG(dbgs() << "Added branch weights: " << *Term << '\n');
+  return true;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::run(Function &F) {
+  // If the workgroup has only a single thread, the condition cannot vary.
+  const auto WGSizes = ST.getFlatWorkGroupSizes(F);
+  if (WGSizes.first <= 1)
+    return false;
+
+  using namespace PatternMatch;
+
+  bool Changed = false;
+  for (auto &BB : F) {
+    auto *Term = BB.getTerminator();
+    // Look for conditional branches whose condition is an ExtractValueInst
+    // that extracts the return value of a call to the amdgcn_if or amdgcn_else
+    // intrinsic.
+    if (match(Term, m_Br(m_ExtractValue<0>(m_CombineOr(
+                             m_Intrinsic<Intrinsic::amdgcn_if>(),
+                             m_Intrinsic<Intrinsic::amdgcn_else>())),
+                         m_Value(), m_Value()))) {
+      // This condition is an artificial value resulting from the control
+      // flow intrinsic, not the actual branch condition. However, the
+      // intrinsics connect it via data flow with the actual condition
+      // (even for the amdgcn_else intrinsic, via the matching amdgcn_if
+      // intrinsic), so isLikelyVarying still produces meaningful results.
+      if (isLikelyVarying(cast<BranchInst>(Term)->getCondition()))
+        Changed |= setTrueSuccessorLikely(cast<BranchInst>(Term));
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..63a7b0a50c4455 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -65,6 +65,9 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
 FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-annotate-varying-branch-weights",
+              AMDGPUAnnotateVaryingBranchWeightsPass(
+                  *static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index da18f2b20f1427..325db6ca9b7ace 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -501,6 +501,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUReserveWWMRegsPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
+  initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
@@ -1315,6 +1316,7 @@ bool GCNPassConfig::addPreISel() {
   // analysis. This depends on stopping SIAnnotateControlFlow from making
   // control flow modifications.
   addPass(createAMDGPURewriteUndefForPHILegacyPass());
+  addPass(createAMDGPUAnnotateVaryingBranchWeightsLegacyPass());
 
   addPass(createLCSSAPass());
 
@@ -2003,6 +2005,8 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
   // control flow modifications.
   addPass(AMDGPURewriteUndefForPHIPass());
 
+  addPass(AMDGPUAnnotateVaryingBranchWeightsPass(TM));
+
   addPass(LCSSAPass());
 
   if (TM.getOptLevel() > CodeGenOptLevel::Less)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..7e85770c70d5ff 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
+  AMDGPUAnnotateVaryingBranchWeights.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
new file mode 100644
index 00000000000000..f5f4c0a12eaeee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+sramecc,-xnack < %s | FileCheck %s
+
+; Check that simple conditional memory accesses that are guarded by likely
+; varying conditions are not lowered with an s_cbranch_execz to bypass them.
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+define amdgpu_kernel void @cond_ops(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: cond_ops:
+; CHECK:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; CHECK-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; CHECK-NEXT:    v_lshl_or_b32 v5, v0, 6, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 4, v5
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 4, v5
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:  ; %bb.1: ; %do.load
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[8:9]
+; CHECK-NEXT:  ; %bb.2: ; %post.load
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v5
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v5
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_cbranch_execz .LBB0_4
+; CHECK-NEXT:  ; %bb.3: ; %do.store
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[10:11]
+; CHECK-NEXT:  .LBB0_4: ; %exit
+; CHECK-NEXT:    s_endpgm
+entry:
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %k = lshr i32 %tid, 4
+  %j = and i32 %tid, 15
+  %load.cond = icmp ult i32 %k, 15
+  %tid.ext = zext nneg i32 %tid to i64
+  %my.x = getelementptr <4 x float>, ptr addrspace(1) %x, i64 %tid.ext
+  br i1 %load.cond, label %do.load, label %post.load
+do.load:
+  %loaded = load <4 x float>, ptr addrspace(1) %my.x
+  br label %post.load
+post.load:
+  %maybe.loaded = phi <4 x float> [ %loaded, %do.load ], [ zeroinitializer, %entry ]
+  %my.y = getelementptr <4 x float>, ptr addrspace(1) %y, i64 %tid.ext
+  %store.cond = icmp ult i32 %j, 15
+  br i1 %store.cond, label %do.store, label %exit
+do.store:
+  store <4 x float> %maybe.loaded, ptr addrspace(1) %my.y
+  br label %exit
+exit:
+  ret void
+}
+
+attributes #0 = {"uniform-work-group-size"="true" "amdgpu-flat-work-group-size"="256,256"}
+!0 = !{i32 64, i32 4, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index e77f4f69e265bb..65bb9a9652b17e 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -87,6 +87,7 @@
 ; GCN-O0-NEXT:        Cycle Info Analysis
 ; GCN-O0-NEXT:        Uniformity Analysis
 ; GCN-O0-NEXT:        AMDGPU Rewrite Undef for PHI
+; GCN-O0-NEXT:        AMDGPU Annotate Varying Branch Weights
 ; GCN-O0-NEXT:        LCSSA Verifier
 ; GCN-O0-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O0-NEXT:      DummyCGSCCPass
@@ -279,6 +280,7 @@
 ; GCN-O1-NEXT:        Cycle Info Analysis
 ; GCN-O1-NEXT:        Uniformity Analysis
 ; GCN-O1-NEXT:        AMDGPU Rewrite Undef for PHI
+; GCN-O1-NEXT:        AMDGPU Annotate Varying Branch Weights
 ; GCN-O1-NEXT:        LCSSA Verifier
 ; GCN-O1-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O1-NEXT:      DummyCGSCCPass
@@ -579,6 +581,7 @@
 ; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:        Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:        AMDGPU Rewrite Undef for PHI
+; GCN-O1-OPTS-NEXT:        AMDGPU Annotate Varying Branch Weights
 ; GCN-O1-OPTS-NEXT:        LCSSA Verifier
 ; GCN-O1-OPTS-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O1-OPTS-NEXT:      DummyCGSCCPass
@@ -891,6 +894,7 @@
 ; GCN-O2-NEXT:        Cycle Info Analysis
 ; GCN-O2-NEXT:        Uniformity Analysis
 ; GCN-O2-NEXT:        AMDGPU Rewrite Undef for PHI
+; GCN-O2-NEXT:        AMDGPU Annotate Varying Branch Weights
 ; GCN-O2-NEXT:        LCSSA Verifier
 ; GCN-O2-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O2-NEXT:      Analysis if a function is memory bound
@@ -1218,6 +1222,7 @@
 ; GCN-O3-NEXT:        Cycle Info Analysis
 ; GCN-O3-NEXT:        Uniformity Analysis
 ; GCN-O3-NEXT:        AMDGPU Rewrite Undef for PHI
+; GCN-O3-NEXT:        AMDGPU Annotate Varying Branch Weights
 ; GCN-O3-NEXT:        LCSSA Verifier
 ; GCN-O3-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O3-NEXT:      Analysis if a function is memory bound
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index c826980991f94f..5db6dd8d64d283 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -146,8 +146,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_mov_b32_e32 v47, 0
 ; CHECK-NEXT:    s_mov_b32 s47, 0
 ; CHECK-NEXT:  .LBB0_5: ; =>This Loop Header: Depth=1
-; CHECK-NEXT:    ; Child Loop BB0_8 Depth 2
-; CHECK-NEXT:    ; Child Loop BB0_20 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_7 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_19 Depth 2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s47, v44
 ; CHECK-NEXT:    s_lshl_b32 s4, s47, 5
 ; CHECK-NEXT:    s_add_i32 s46, s47, 1
@@ -163,20 +163,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s53, 0
 ; CHECK-NEXT:    s_mov_b32 s56, 0
-; CHECK-NEXT:    s_branch .LBB0_8
-; CHECK-NEXT:  .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT:    s_add_i32 s56, s56, 4
-; CHECK-NEXT:    s_add_i32 s4, s47, s56
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, s56, v57
-; CHECK-NEXT:    s_add_i32 s5, s4, 5
-; CHECK-NEXT:    s_add_i32 s4, s4, 1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
-; CHECK-NEXT:    v_mov_b32_e32 v58, s4
-; CHECK-NEXT:    s_or_b32 s53, vcc_lo, s53
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s53
-; CHECK-NEXT:    s_cbranch_execz .LBB0_16
-; CHECK-NEXT:  .LBB0_8: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:  .LBB0_7: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v59, s56, v46
 ; CHECK-NEXT:    v_add_nc_u32_e32 v58, s56, v57
@@ -184,8 +171,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_10
-; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_9
+; CHECK-NEXT:  ; %bb.8: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -203,14 +190,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v58
-; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:  .LBB0_9: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_12
-; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_11
+; CHECK-NEXT:  ; %bb.10: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -229,14 +216,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v60
-; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:  .LBB0_11: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_14
-; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_13
+; CHECK-NEXT:  ; %bb.12: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -255,14 +242,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v60
-; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:  .LBB0_13: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_7
-; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_15
+; CHECK-NEXT:  ; %bb.14: ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -281,8 +268,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v58
-; CHECK-NEXT:    s_branch .LBB0_7
-; CHECK-NEXT:  .LBB0_16: ; %Flow45
+; CHECK-NEXT:  .LBB0_15: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_add_i32 s56, s56, 4
+; CHECK-NEXT:    s_add_i32 s4, s47, s56
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s56, v57
+; CHECK-NEXT:    s_add_i32 s5, s4, 5
+; CHECK-NEXT:    s_add_i32 s4, s4, 1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT:    v_mov_b32_e32 v58, s4
+; CHECK-NEXT:    s_or_b32 s53, vcc_lo, s53
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_7
+; CHECK-NEXT:  ; %bb.16: ; %Flow45
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT:    v_mov_b32_e32 v57, v0
@@ -296,25 +294,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s52, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:    s_branch .LBB0_20
 ; CHECK-NEXT:    .p2align 6
-; CHECK-NEXT:  .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
-; CHECK-NEXT:    v_add_nc_u32_e32 v58, 1, v58
-; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v58, v42
-; CHECK-NEXT:    s_or_b32 s52, vcc_lo, s52
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
-; CHECK-NEXT:    s_cbranch_execz .LBB0_22
-; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:  .LBB0_19: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v58
 ; CHECK-NEXT:    ds_read_u8 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s53, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_19
-; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_21
+; CHECK-NEXT:  ; %bb.20: ; in Loop: Header=BB0_19 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -332,8 +321,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v57
-; CHECK-NEXT:    s_branch .LBB0_19
-; CHECK-NEXT:  .LBB0_22: ; %Flow43
+; CHECK-NEXT:  .LBB0_21: ; in Loop: Header=BB0_19 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    v_add_nc_u32_e32 v58, 1, v58
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v58, v42
+; CHECK-NEXT:    s_or_b32 s52, vcc_lo, s52
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_19
+; CHECK-NEXT:  ; %bb.22: ; %Flow43
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_inst_prefetch 0x2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s52
@@ -374,29 +370,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_33
 ; CHECK-NEXT:  ; %bb.26:
 ; CHECK-NEXT:    s_mov_b32 s44, 0
-; CHECK-NEXT:    s_branch .LBB0_28
-; CHECK-NEXT:  .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s45
-; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s8, s34, 40
-; CHECK-NEXT:    s_addc_u32 s9, s35, 0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
-; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_add_co_u32 v41, vcc_lo, v0, v41
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v41
-; CHECK-NEXT:    s_or_b32 s44, vcc_lo, s44
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; CHECK-NEXT:    s_cbranch_execz .LBB0_33
-; CHECK-NEXT:  .LBB0_28: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:  .LBB0_27: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v41
 ; CHECK-NEXT:    s_mov_b32 s45, exec_lo
 ; CHECK-NEXT:    ds_read_b32 v0, v0
@@ -424,8 +398,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_or_b32_e32 v5, v46, v57
 ; CHECK-NEXT:    v_or_b32_e32 v4, v45, v56
 ; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_27
-; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_32
+; CHECK-NEXT:  ; %bb.28: ; in Loop: Header=BB0_27 Depth=1
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx2 v[58:59], v[2:3], off offset:24
 ; CHECK-NEXT:    global_load_dwordx2 v[60:61], v[0:1], off offset:24
@@ -465,8 +439,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_mov_b32 s4, exec_lo
 ; CHECK-NEXT:    v_cmpx_gt_u32_e32 12, v0
 ; CHECK-NEXT:    s_xor_b32 s4, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_31
-; CHECK-NEXT:  ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_30
+; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_27 Depth=1
 ; CHECK-NEXT:    v_xor_b32_e32 v4, v60, v58
 ; CHECK-NEXT:    v_lshrrev_b64 v[2:3], 16, v[56:57]
 ; CHECK-NEXT:    v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51]
@@ -489,11 +463,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    global_store_dword v[6:7], v8, off offset:4
 ; CHECK-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off offset:8
 ; CHECK-NEXT:    global_store_dwordx2 v[6:7], v[4:5], off offset:24
-; CHECK-NEXT:  .LBB0_31: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:  .LBB0_30: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_27 Depth=1
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s4, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_27
-; CHECK-NEXT:  ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_32
+; CHECK-NEXT:  ; %bb.31: ; in Loop: Header=BB0_27 Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v42
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v43
@@ -510,7 +484,27 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_branch .LBB0_27
+; CHECK-NEXT:  .LBB0_32: ; in Loop: Header=BB0_27 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s45
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s43
+; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_add_co_u32 v41, vcc_lo, v0, v41
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v41
+; CHECK-NEXT:    s_or_b32 s44, vcc_lo, s44
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_27
 ; CHECK-NEXT:  .LBB0_33:
 ; CHECK-NEXT:    s_endpgm
   %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
@@ -875,7 +869,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:  .LBB1_1: ; %.37
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB1_3 Depth 2
-; CHECK-NEXT:    ; Child Loop BB1_8 Depth 2
+; CHECK-NEXT:    ; Child Loop BB1_7 Depth 2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s4, v44
 ; CHECK-NEXT:    s_lshl_b32 s5, s4, 5
 ; CHECK-NEXT:    s_add_i32 s45, s4, 1
@@ -919,18 +913,8 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s47, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:    s_branch .LBB1_8
 ; CHECK-NEXT:    .p2align 6
-; CHECK-NEXT:  .LBB1_7: ; %.114
-; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
-; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
-; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
-; CHECK-NEXT:    s_or_b32 s47, vcc_lo, s47
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s47
-; CHECK-NEXT:    s_cbranch_execz .LBB1_10
-; CHECK-NEXT:  .LBB1_8: ; %.103
+; CHECK-NEXT:  .LBB1_7: ; %.103
 ; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v56
@@ -938,9 +922,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s48, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB1_7
-; CHECK-NEXT:  ; %bb.9: ; %.110
-; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB1_9
+; CHECK-NEXT:  ; %bb.8: ; %.110
+; CHECK-NEXT:    ; in Loop: Header=BB1_7 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s38, 40
@@ -958,8 +942,16 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v47
-; CHECK-NEXT:    s_branch .LBB1_7
-; CHECK-NEXT:  .LBB1_10: ; %Flow
+; CHECK-NEXT:  .LBB1_9: ; %.114
+; CHECK-NEXT:    ; in Loop: Header=BB1_7 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT:    s_or_b32 s47, vcc_lo, s47
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s47
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_7
+; CHECK-NEXT:  ; %bb.10: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_inst_prefetch 0x2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s47
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index d122e8a21671be..2970b820dcd7d1 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -142,6 +142,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUCodeGenPrepare.cpp",
     "AMDGPUCombinerHelper.cpp",
     "AMDGPUCtorDtorLowering.cpp",
+    "AMDGPUEXECValueAnalysis.cpp",
     "AMDGPUExportClustering.cpp",
     "AMDGPUFrameLowering.cpp",
     "AMDGPUGlobalISelDivergenceLowering.cpp",

From 345269b1523b1c2af48dc3798519031bf30d218d Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 27 Nov 2024 03:50:54 -0500
Subject: [PATCH 2/4] fixup! [AMDGPU] Eliminate likely-spurious execz checks

Address the first round of reviewer feedback: do not match intrinsics to identify candidate branches, preserve the alphabetical order of the pass registry, rename TTI to GCNTTI, and update the tests.
---
 .../AMDGPUAnnotateVaryingBranchWeights.cpp    | 58 +++++++++----------
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  6 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  8 +--
 .../conditional-mem-no-cbranch-execz.ll       | 58 ++++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      | 10 ++--
 ...ne-sink-temporal-divergence-swdev407790.ll |  2 -
 6 files changed, 96 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
index 3c637290cbdbe3..b67e9bde06e1a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
@@ -22,14 +22,11 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
-#include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Analysis.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ProfDataUtils.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Casting.h"
@@ -45,12 +42,13 @@ class AMDGPUAnnotateVaryingBranchWeightsImpl {
 public:
   AMDGPUAnnotateVaryingBranchWeightsImpl() = delete;
   AMDGPUAnnotateVaryingBranchWeightsImpl(const GCNSubtarget &ST,
-                                         const TargetTransformInfo &TTI)
-      : ST(ST), TTI(TTI) {
+                                         const TargetTransformInfo &GCNTTI,
+                                         UniformityInfo &UA)
+      : ST(ST), GCNTTI(GCNTTI), UA(UA) {
     // Determine weights that signal that a branch is very likely to be
     // predicted correctly, i.e., whose ratio exceeds
     // TTI.getPredictableBranchThreshold().
-    auto BranchProbThreshold = TTI.getPredictableBranchThreshold();
+    auto BranchProbThreshold = GCNTTI.getPredictableBranchThreshold();
     LikelyWeight = BranchProbThreshold.getNumerator();
     UnlikelyWeight = BranchProbThreshold.getDenominator() - LikelyWeight;
     if (UnlikelyWeight > 0)
@@ -61,7 +59,8 @@ class AMDGPUAnnotateVaryingBranchWeightsImpl {
 
 private:
   const GCNSubtarget &ST;
-  const TargetTransformInfo &TTI;
+  const TargetTransformInfo &GCNTTI;
+  const UniformityInfo &UA;
   uint32_t LikelyWeight;
   uint32_t UnlikelyWeight;
   ValueMap<const Value *, bool> LikelyVaryingCache;
@@ -89,16 +88,21 @@ class AMDGPUAnnotateVaryingBranchWeightsLegacy : public FunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<UniformityInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
+
     AU.setPreservesCFG();
+    AU.addPreserved<UniformityInfoWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override {
     TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+    UniformityInfo &UA =
+        getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
     const TargetMachine &TM = TPC.getTM<TargetMachine>();
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-    const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
-    return AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+    const TargetTransformInfo &GCNTTI = TM.getTargetTransformInfo(F);
+    return AMDGPUAnnotateVaryingBranchWeightsImpl(ST, GCNTTI, UA).run(F);
   }
 };
 
@@ -108,6 +112,7 @@ char AMDGPUAnnotateVaryingBranchWeightsLegacy::ID = 0;
 
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
                       "Annotate Varying Branch Weights", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
                     "Annotate Varying Branch Weights", false, false)
 
@@ -119,20 +124,22 @@ PreservedAnalyses
 AMDGPUAnnotateVaryingBranchWeightsPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
-  bool Changed = AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+  const TargetTransformInfo &GCNTTI = TM.getTargetTransformInfo(F);
+  UniformityInfo &UA = AM.getResult<UniformityInfoAnalysis>(F);
+  bool Changed = AMDGPUAnnotateVaryingBranchWeightsImpl(ST, GCNTTI, UA).run(F);
 
   if (!Changed)
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
+  PA.preserve<UniformityInfoAnalysis>();
   return PA;
 }
 
 bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
   // Check if V is a source of divergence or if it transitively uses one.
-  if (TTI.isSourceOfDivergence(V))
+  if (GCNTTI.isSourceOfDivergence(V))
     return true;
 
   auto *U = dyn_cast<User>(V);
@@ -175,26 +182,15 @@ bool AMDGPUAnnotateVaryingBranchWeightsImpl::run(Function &F) {
   if (WGSizes.first <= 1)
     return false;
 
-  using namespace PatternMatch;
-
   bool Changed = false;
   for (auto &BB : F) {
-    auto *Term = BB.getTerminator();
-    // Look for conditional branches whose condition is an ExtractValueInst
-    // that extracts the return value of a call to the amdgcn_if or amdgcn_else
-    // intrinsic.
-    if (match(Term, m_Br(m_ExtractValue<0>(m_CombineOr(
-                             m_Intrinsic<Intrinsic::amdgcn_if>(),
-                             m_Intrinsic<Intrinsic::amdgcn_else>())),
-                         m_Value(), m_Value()))) {
-      // This condition is an artificial value resulting from the control
-      // flow intrinsic, not the actual branch condition. However, the
-      // intrinsics connect it via data flow with the actual condition
-      // (even for the amdgcn_else intrinsic, via the matching amdgcn_if
-      // intrinsic), so isLikelyVarying still produces meaningful results.
-      if (isLikelyVarying(cast<BranchInst>(Term)->getCondition()))
-        Changed |= setTrueSuccessorLikely(cast<BranchInst>(Term));
-    }
+    auto *Br = dyn_cast<BranchInst>(BB.getTerminator());
+    // Only consider statically non-uniform conditional branches.
+    if (!Br || !Br->isConditional() || UA.isUniform(Br))
+      continue;
+
+    if (isLikelyVarying(Br->getCondition()))
+      Changed |= setTrueSuccessorLikely(Br);
   }
 
   return Changed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 63a7b0a50c4455..d8911cd8e0687e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -44,6 +44,9 @@ MODULE_PASS_WITH_PARAMS(
 #define FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 FUNCTION_PASS("amdgpu-annotate-uniform", AMDGPUAnnotateUniformValuesPass())
+FUNCTION_PASS("amdgpu-annotate-varying-branch-weights",
+              AMDGPUAnnotateVaryingBranchWeightsPass(
+                  *static_cast<const GCNTargetMachine *>(this)))
 FUNCTION_PASS("amdgpu-codegenprepare", AMDGPUCodeGenPreparePass(*this))
 FUNCTION_PASS("amdgpu-image-intrinsic-opt",
               AMDGPUImageIntrinsicOptimizerPass(*this))
@@ -65,9 +68,6 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
 FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
-FUNCTION_PASS("amdgpu-annotate-varying-branch-weights",
-              AMDGPUAnnotateVaryingBranchWeightsPass(
-                  *static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 325db6ca9b7ace..7ccefb9c2e0425 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -481,6 +481,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAttributorLegacyPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
+  initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
@@ -501,7 +502,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUReserveWWMRegsPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
-  initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
@@ -1312,11 +1312,11 @@ bool GCNPassConfig::addPreISel() {
 
   addPass(createAMDGPUAnnotateUniformValuesLegacy());
   addPass(createSIAnnotateControlFlowLegacyPass());
+  addPass(createAMDGPUAnnotateVaryingBranchWeightsLegacyPass());
   // TODO: Move this right after structurizeCFG to avoid extra divergence
   // analysis. This depends on stopping SIAnnotateControlFlow from making
   // control flow modifications.
   addPass(createAMDGPURewriteUndefForPHILegacyPass());
-  addPass(createAMDGPUAnnotateVaryingBranchWeightsLegacyPass());
 
   addPass(createLCSSAPass());
 
@@ -2000,13 +2000,13 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
 
   addPass(SIAnnotateControlFlowPass(TM));
 
+  addPass(AMDGPUAnnotateVaryingBranchWeightsPass(TM));
+
   // TODO: Move this right after structurizeCFG to avoid extra divergence
   // analysis. This depends on stopping SIAnnotateControlFlow from making
   // control flow modifications.
   addPass(AMDGPURewriteUndefForPHIPass());
 
-  addPass(AMDGPUAnnotateVaryingBranchWeightsPass(TM));
-
   addPass(LCSSAPass());
 
   if (TM.getOptLevel() > CodeGenOptLevel::Less)
diff --git a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
index f5f4c0a12eaeee..b312c896fcd4b6 100644
--- a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
+++ b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+sramecc,-xnack < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
 
 ; Check that simple conditional memory accesses that are guarded by likely
 ; varying conditions are not lowered with an s_cbranch_execz to bypass them.
@@ -62,5 +62,61 @@ exit:
   ret void
 }
 
+define amdgpu_kernel void @cond_ops_no_attrs(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) {
+; CHECK-LABEL: cond_ops_no_attrs:
+; CHECK:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; CHECK-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; CHECK-NEXT:    v_lshl_or_b32 v5, v0, 6, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 4, v5
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 4, v5
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_cbranch_execz .LBB1_2
+; CHECK-NEXT:  ; %bb.1: ; %do.load
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[8:9]
+; CHECK-NEXT:  .LBB1_2: ; %post.load
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v5
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v5
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_cbranch_execz .LBB1_4
+; CHECK-NEXT:  ; %bb.3: ; %do.store
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[10:11]
+; CHECK-NEXT:  .LBB1_4: ; %exit
+; CHECK-NEXT:    s_endpgm
+entry:
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %k = lshr i32 %tid, 4
+  %j = and i32 %tid, 15
+  %load.cond = icmp ult i32 %k, 15
+  %tid.ext = zext nneg i32 %tid to i64
+  %my.x = getelementptr <4 x float>, ptr addrspace(1) %x, i64 %tid.ext
+  br i1 %load.cond, label %do.load, label %post.load
+do.load:
+  %loaded = load <4 x float>, ptr addrspace(1) %my.x
+  br label %post.load
+post.load:
+  %maybe.loaded = phi <4 x float> [ %loaded, %do.load ], [ zeroinitializer, %entry ]
+  %my.y = getelementptr <4 x float>, ptr addrspace(1) %y, i64 %tid.ext
+  %store.cond = icmp ult i32 %j, 15
+  br i1 %store.cond, label %do.store, label %exit
+do.store:
+  store <4 x float> %maybe.loaded, ptr addrspace(1) %my.y
+  br label %exit
+exit:
+  ret void
+}
+
 attributes #0 = {"uniform-work-group-size"="true" "amdgpu-flat-work-group-size"="256,256"}
 !0 = !{i32 64, i32 4, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 65bb9a9652b17e..ea45acf0b50f31 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -86,8 +86,8 @@
 ; GCN-O0-NEXT:        SI annotate control flow
 ; GCN-O0-NEXT:        Cycle Info Analysis
 ; GCN-O0-NEXT:        Uniformity Analysis
-; GCN-O0-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O0-NEXT:        AMDGPU Annotate Varying Branch Weights
+; GCN-O0-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O0-NEXT:        LCSSA Verifier
 ; GCN-O0-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O0-NEXT:      DummyCGSCCPass
@@ -279,8 +279,8 @@
 ; GCN-O1-NEXT:        SI annotate control flow
 ; GCN-O1-NEXT:        Cycle Info Analysis
 ; GCN-O1-NEXT:        Uniformity Analysis
-; GCN-O1-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O1-NEXT:        AMDGPU Annotate Varying Branch Weights
+; GCN-O1-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O1-NEXT:        LCSSA Verifier
 ; GCN-O1-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O1-NEXT:      DummyCGSCCPass
@@ -580,8 +580,8 @@
 ; GCN-O1-OPTS-NEXT:        SI annotate control flow
 ; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:        Uniformity Analysis
-; GCN-O1-OPTS-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O1-OPTS-NEXT:        AMDGPU Annotate Varying Branch Weights
+; GCN-O1-OPTS-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O1-OPTS-NEXT:        LCSSA Verifier
 ; GCN-O1-OPTS-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O1-OPTS-NEXT:      DummyCGSCCPass
@@ -893,8 +893,8 @@
 ; GCN-O2-NEXT:        SI annotate control flow
 ; GCN-O2-NEXT:        Cycle Info Analysis
 ; GCN-O2-NEXT:        Uniformity Analysis
-; GCN-O2-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O2-NEXT:        AMDGPU Annotate Varying Branch Weights
+; GCN-O2-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O2-NEXT:        LCSSA Verifier
 ; GCN-O2-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O2-NEXT:      Analysis if a function is memory bound
@@ -1221,8 +1221,8 @@
 ; GCN-O3-NEXT:        SI annotate control flow
 ; GCN-O3-NEXT:        Cycle Info Analysis
 ; GCN-O3-NEXT:        Uniformity Analysis
-; GCN-O3-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O3-NEXT:        AMDGPU Annotate Varying Branch Weights
+; GCN-O3-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O3-NEXT:        LCSSA Verifier
 ; GCN-O3-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O3-NEXT:      Analysis if a function is memory bound
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 5db6dd8d64d283..05debf0d2c119c 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -294,7 +294,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s52, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_19: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v58
@@ -913,7 +912,6 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s47, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB1_7: ; %.103
 ; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2

From 8cf0cbf8fb44c9630cb48f51f3b2e22b68598272 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 27 Nov 2024 10:18:16 -0500
Subject: [PATCH 3/4] fixup! fixup! [AMDGPU] Eliminate likely-spurious execz
 checks

Make the heuristic more strict, add IR tests
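
In short, the stricter heuristic only treats workitem-id intrinsics as
relevant sources of divergence in dimensions that can actually differ
within a single wavefront. A sketch of the metadata it consumes (the
values match the new IR tests; the kernel name is hypothetical, and the
interpretation assumes waves are filled along the x dimension first):

  ; With a wavefront size of 64 and the required work group size below,
  ; threads of a wave only differ in their x coordinate, so only
  ; llvm.amdgcn.workitem.id.x (and the mbcnt intrinsics) are treated as
  ; likely varying.
  define amdgpu_kernel void @k() !reqd_work_group_size !0 { ... }
  !0 = !{i32 64, i32 4, i32 1}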
---
 .../AMDGPUAnnotateVaryingBranchWeights.cpp    |  80 ++-
 .../annotate-likely-varying-branches.ll       | 462 ++++++++++++++++++
 .../conditional-mem-no-cbranch-execz.ll       |  59 +--
 ...ne-sink-temporal-divergence-swdev407790.ll | 178 +++----
 4 files changed, 627 insertions(+), 152 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
index b67e9bde06e1a6..44546cfbdae189 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
@@ -26,7 +26,11 @@
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Analysis.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/ProfDataUtils.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Casting.h"
@@ -44,7 +48,7 @@ class AMDGPUAnnotateVaryingBranchWeightsImpl {
   AMDGPUAnnotateVaryingBranchWeightsImpl(const GCNSubtarget &ST,
                                          const TargetTransformInfo &GCNTTI,
                                          UniformityInfo &UA)
-      : ST(ST), GCNTTI(GCNTTI), UA(UA) {
+      : ST(ST), UA(UA) {
     // Determine weights that signal that a branch is very likely to be
     // predicted correctly, i.e., whose ratio exceeds
     // TTI.getPredictableBranchThreshold().
@@ -59,11 +63,13 @@ class AMDGPUAnnotateVaryingBranchWeightsImpl {
 
 private:
   const GCNSubtarget &ST;
-  const TargetTransformInfo &GCNTTI;
   const UniformityInfo &UA;
   uint32_t LikelyWeight;
   uint32_t UnlikelyWeight;
   ValueMap<const Value *, bool> LikelyVaryingCache;
+  unsigned HighestLikelyVaryingDimension = 0;
+
+  bool isRelevantSourceOfDivergence(const Value *V) const;
 
   /// Heuristically check if it is likely that a wavefront has dynamically
   /// varying values for V.
@@ -72,7 +78,7 @@ class AMDGPUAnnotateVaryingBranchWeightsImpl {
   /// Set branch weights that signal that the "true" successor of Term is the
   /// likely destination, if no prior weights are present.
   /// Return true if weights were set.
-  bool setTrueSuccessorLikely(BranchInst *Term);
+  bool setTrueSuccessorLikely(BranchInst *Term) const;
 };
 
 class AMDGPUAnnotateVaryingBranchWeightsLegacy : public FunctionPass {
@@ -137,13 +143,43 @@ AMDGPUAnnotateVaryingBranchWeightsPass::run(Function &F,
   return PA;
 }
 
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::isRelevantSourceOfDivergence(
+    const Value *V) const {
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  if (!II)
+    return false;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return HighestLikelyVaryingDimension >= 2;
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return HighestLikelyVaryingDimension >= 1;
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::r600_read_tidig_x:
+  case Intrinsic::amdgcn_mbcnt_hi:
+  case Intrinsic::amdgcn_mbcnt_lo:
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
   // Check if V is a source of divergence or if it transitively uses one.
-  if (GCNTTI.isSourceOfDivergence(V))
+  if (isRelevantSourceOfDivergence(V))
     return true;
 
-  auto *U = dyn_cast<User>(V);
-  if (!U)
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // ExtractValueInst and IntrinsicInst enable looking through the
+  // amdgcn_if/else intrinsics inserted by SIAnnotateControlFlow.
+  // This condition excludes PHINodes, which prevents infinite recursion.
+  if (!isa<BinaryOperator>(I) && !isa<UnaryOperator>(I) && !isa<CastInst>(I) &&
+      !isa<CmpInst>(I) && !isa<ExtractValueInst>(I) && !isa<IntrinsicInst>(I))
     return false;
 
   // Have we already checked V?
@@ -153,7 +189,7 @@ bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
 
   // Does it use a likely varying Value?
   bool Result = false;
-  for (const auto &Use : U->operands()) {
+  for (const auto &Use : I->operands()) {
     Result |= isLikelyVarying(Use);
     if (Result)
       break;
@@ -164,7 +200,7 @@ bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
 }
 
 bool AMDGPUAnnotateVaryingBranchWeightsImpl::setTrueSuccessorLikely(
-    BranchInst *Term) {
+    BranchInst *Term) const {
   assert(Term->isConditional());
 
   // Don't overwrite existing branch weights.
@@ -177,9 +213,33 @@ bool AMDGPUAnnotateVaryingBranchWeightsImpl::setTrueSuccessorLikely(
 }
 
 bool AMDGPUAnnotateVaryingBranchWeightsImpl::run(Function &F) {
+  unsigned MinWGSize = ST.getFlatWorkGroupSizes(F).first;
+  bool MustHaveMoreThanOneThread = MinWGSize > 1;
+
+  // reqd_work_group_size determines the size of the work group in every
+  // dimension. If it is present, identify the dimensions where the workitem id
+  // differs between the threads of the same wavefront. Otherwise assume that
+  // only dimension 0, i.e., x, varies.
+  //
+  // TODO can/should we assume that workitems are grouped into waves like that?
+  auto *Node = F.getMetadata("reqd_work_group_size");
+  if (Node && Node->getNumOperands() == 3) {
+    unsigned WavefrontSize = ST.getWavefrontSize();
+    unsigned ThreadsSoFar = 1;
+    unsigned Dim = 0;
+    for (; Dim < 3; ++Dim) {
+      ThreadsSoFar *=
+          mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
+      if (ThreadsSoFar >= WavefrontSize)
+        break;
+    }
+    HighestLikelyVaryingDimension = Dim;
+    LLVM_DEBUG(dbgs() << "Highest Likely Varying Dimension: " << Dim << '\n');
+    MustHaveMoreThanOneThread |= ThreadsSoFar > 1;
+  }
+
   // If the workgroup has only a single thread, the condition cannot vary.
-  const auto WGSizes = ST.getFlatWorkGroupSizes(F);
-  if (WGSizes.first <= 1)
+  if (!MustHaveMoreThanOneThread)
     return false;
 
   bool Changed = false;
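
(For reference on the dimension computation above: on a wave64 target
such as gfx942, as used in the new tests, reqd_work_group_size
{64, 4, 1} already reaches the wavefront size in dimension 0, so only
the x workitem id is treated as likely varying; {8, 32, 1} needs
dimensions 0 and 1 to cover a full wave (8 * 32 = 256 >= 64), so both
x and y are treated as likely varying.)
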
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll b/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
new file mode 100644
index 00000000000000..26956868ddce58
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --amdgpu-annotate-varying-branch-weights -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR1:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even_ann_cf(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ann_cf(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR1]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[CF_VAL:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]])
+; CHECK-NEXT:    [[PSEUDO_COND:%.*]] = extractvalue { i1, i64 } [[CF_VAL]], 0
+; CHECK-NEXT:    [[MASK:%.*]] = extractvalue { i1, i64 } [[CF_VAL]], 1
+; CHECK-NEXT:    br i1 [[PSEUDO_COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[MASK]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  %cf_val = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+  %pseudo.cond = extractvalue { i1, i64 } %cf_val, 0
+  %mask = extractvalue { i1, i64 } %cf_val, 1
+  br i1 %pseudo.cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  call void @llvm.amdgcn.end.cf.i64(i64 %mask)
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_complex1(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_complex1(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR1]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[K:%.*]] = lshr i32 [[TID]], 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[K]], 15
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %k = lshr i32 %tid, 4
+  %cond = icmp ult i32 %k, 15
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_complex2(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_complex2(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR1]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[J:%.*]] = and i32 [[TID]], 15
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[J]], 15
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %j = and i32 %tid, 15
+  %cond = icmp ult i32 %j, 15
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even_only_reqd_wgsz(ptr addrspace(1) inreg %dest) !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_only_reqd_wgsz(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2:[0-9]+]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even_only_flat_wgsz(ptr addrspace(1) inreg %dest) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_only_flat_wgsz(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying, since the y dimension varies in each
+; wavefront with the required work group size:
+define amdgpu_kernel void @cond_store_even_ydim_small_wgs(ptr addrspace(1) inreg %dest) !reqd_work_group_size !1 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ydim_small_wgs(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META2:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 3
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID_Y]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]], !prof [[PROF1]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 3
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid.y, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is not likely varying, because there are no attributes with
+; work group size information:
+define amdgpu_kernel void @cond_store_even_no_attributes(ptr addrspace(1) inreg %dest) {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_no_attributes(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is not likely varying, because the condition only depends on a
+; workitem id dimension that does not vary per wavefront (namely y):
+define amdgpu_kernel void @cond_store_even_ydim(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ydim(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR1]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID_Y]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid.y, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is not likely varying, because its condition is directly
+; loaded from memory:
+define amdgpu_kernel void @cond_store_loaded(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_loaded(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]]) #[[ATTR1]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LOOKUP_VALUE]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lookup.addr = getelementptr i32, ptr addrspace(1) %lookup, i64 %tid.ext
+  %lookup.value = load i32, ptr addrspace(1) %lookup.addr
+  %cond = icmp eq i32 %lookup.value, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is not likely varying, because its condition directly results from a PHI:
+define amdgpu_kernel void @cond_store_loop_phi(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_loop_phi(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR1]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ [[VAL_INC:%.*]], %[[LOOP]] ], [ [[TID]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_DEC:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[VAL_INC]] = add i32 [[VAL]], 1
+; CHECK-NEXT:    [[IDX_DEC]] = sub i32 [[IDX]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp eq i32 [[IDX_DEC]], 0
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label %[[LOOP_END:.*]], label %[[LOOP]]
+; CHECK:       [[LOOP_END]]:
+; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LOOKUP_VALUE]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  br label %loop
+loop:
+  %val = phi i32 [%val.inc, %loop], [%tid, %entry]
+  %idx = phi i32 [%idx.dec, %loop], [%n, %entry]
+  %val.inc = add i32 %val, 1
+  %idx.dec = sub i32 %idx, 1
+  %loop.cond = icmp eq i32 %idx.dec, 0
+  br i1 %loop.cond, label %loop.end, label %loop
+loop.end:
+  %lookup.addr = getelementptr i32, ptr addrspace(1) %lookup, i64 %tid.ext
+  %lookup.value = load i32, ptr addrspace(1) %lookup.addr
+  %cond = icmp eq i32 %lookup.value, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+!0 = !{i32 64, i32 4, i32 1}
+!1 = !{i32 8, i32 32, i32 1}
+;.
+; CHECK: [[META0]] = !{i32 64, i32 4, i32 1}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 2126008812, i32 21474835}
+; CHECK: [[META2]] = !{i32 8, i32 32, i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
index b312c896fcd4b6..6b1a6fc6318b3e 100644
--- a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
+++ b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
@@ -7,7 +7,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workitem.id.y()
 
-define amdgpu_kernel void @cond_ops(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) #0 !reqd_work_group_size !0 {
+define amdgpu_kernel void @cond_ops(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) !reqd_work_group_size !0 {
 ; CHECK-LABEL: cond_ops:
 ; CHECK:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; CHECK-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -62,61 +62,4 @@ exit:
   ret void
 }
 
-define amdgpu_kernel void @cond_ops_no_attrs(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) {
-; CHECK-LABEL: cond_ops_no_attrs:
-; CHECK:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; CHECK-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
-; CHECK-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v0, 6, v1
-; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 4, v5
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 4, v5
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; CHECK-NEXT:    s_cbranch_execz .LBB1_2
-; CHECK-NEXT:  ; %bb.1: ; %do.load
-; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[8:9]
-; CHECK-NEXT:  .LBB1_2: ; %post.load
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v5, 15, v5
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v5
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; CHECK-NEXT:    s_cbranch_execz .LBB1_4
-; CHECK-NEXT:  ; %bb.3: ; %do.store
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[10:11]
-; CHECK-NEXT:  .LBB1_4: ; %exit
-; CHECK-NEXT:    s_endpgm
-entry:
-  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
-  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
-  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
-  %tid = or disjoint i32 %tid.x, %tid.y.shift
-  %k = lshr i32 %tid, 4
-  %j = and i32 %tid, 15
-  %load.cond = icmp ult i32 %k, 15
-  %tid.ext = zext nneg i32 %tid to i64
-  %my.x = getelementptr <4 x float>, ptr addrspace(1) %x, i64 %tid.ext
-  br i1 %load.cond, label %do.load, label %post.load
-do.load:
-  %loaded = load <4 x float>, ptr addrspace(1) %my.x
-  br label %post.load
-post.load:
-  %maybe.loaded = phi <4 x float> [ %loaded, %do.load ], [ zeroinitializer, %entry ]
-  %my.y = getelementptr <4 x float>, ptr addrspace(1) %y, i64 %tid.ext
-  %store.cond = icmp ult i32 %j, 15
-  br i1 %store.cond, label %do.store, label %exit
-do.store:
-  store <4 x float> %maybe.loaded, ptr addrspace(1) %my.y
-  br label %exit
-exit:
-  ret void
-}
-
-attributes #0 = {"uniform-work-group-size"="true" "amdgpu-flat-work-group-size"="256,256"}
 !0 = !{i32 64, i32 4, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 05debf0d2c119c..c826980991f94f 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -146,8 +146,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_mov_b32_e32 v47, 0
 ; CHECK-NEXT:    s_mov_b32 s47, 0
 ; CHECK-NEXT:  .LBB0_5: ; =>This Loop Header: Depth=1
-; CHECK-NEXT:    ; Child Loop BB0_7 Depth 2
-; CHECK-NEXT:    ; Child Loop BB0_19 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_8 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_20 Depth 2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s47, v44
 ; CHECK-NEXT:    s_lshl_b32 s4, s47, 5
 ; CHECK-NEXT:    s_add_i32 s46, s47, 1
@@ -163,7 +163,20 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s53, 0
 ; CHECK-NEXT:    s_mov_b32 s56, 0
-; CHECK-NEXT:  .LBB0_7: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:    s_branch .LBB0_8
+; CHECK-NEXT:  .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_add_i32 s56, s56, 4
+; CHECK-NEXT:    s_add_i32 s4, s47, s56
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s56, v57
+; CHECK-NEXT:    s_add_i32 s5, s4, 5
+; CHECK-NEXT:    s_add_i32 s4, s4, 1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT:    v_mov_b32_e32 v58, s4
+; CHECK-NEXT:    s_or_b32 s53, vcc_lo, s53
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    s_cbranch_execz .LBB0_16
+; CHECK-NEXT:  .LBB0_8: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v59, s56, v46
 ; CHECK-NEXT:    v_add_nc_u32_e32 v58, s56, v57
@@ -171,8 +184,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_9
-; CHECK-NEXT:  ; %bb.8: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_10
+; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -190,14 +203,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v58
-; CHECK-NEXT:  .LBB0_9: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_11
-; CHECK-NEXT:  ; %bb.10: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_12
+; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -216,14 +229,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v60
-; CHECK-NEXT:  .LBB0_11: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_13
-; CHECK-NEXT:  ; %bb.12: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_14
+; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -242,14 +255,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v60
-; CHECK-NEXT:  .LBB0_13: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_15
-; CHECK-NEXT:  ; %bb.14: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_7
+; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -268,19 +281,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v58
-; CHECK-NEXT:  .LBB0_15: ; in Loop: Header=BB0_7 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT:    s_add_i32 s56, s56, 4
-; CHECK-NEXT:    s_add_i32 s4, s47, s56
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, s56, v57
-; CHECK-NEXT:    s_add_i32 s5, s4, 5
-; CHECK-NEXT:    s_add_i32 s4, s4, 1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
-; CHECK-NEXT:    v_mov_b32_e32 v58, s4
-; CHECK-NEXT:    s_or_b32 s53, vcc_lo, s53
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s53
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_7
-; CHECK-NEXT:  ; %bb.16: ; %Flow45
+; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:  .LBB0_16: ; %Flow45
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT:    v_mov_b32_e32 v57, v0
@@ -294,15 +296,25 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s52, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:  .LBB0_19: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:    s_branch .LBB0_20
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    v_add_nc_u32_e32 v58, 1, v58
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v58, v42
+; CHECK-NEXT:    s_or_b32 s52, vcc_lo, s52
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
+; CHECK-NEXT:    s_cbranch_execz .LBB0_22
+; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v58
 ; CHECK-NEXT:    ds_read_u8 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s53, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_21
-; CHECK-NEXT:  ; %bb.20: ; in Loop: Header=BB0_19 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_19
+; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -320,15 +332,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v57
-; CHECK-NEXT:  .LBB0_21: ; in Loop: Header=BB0_19 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
-; CHECK-NEXT:    v_add_nc_u32_e32 v58, 1, v58
-; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v58, v42
-; CHECK-NEXT:    s_or_b32 s52, vcc_lo, s52
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_19
-; CHECK-NEXT:  ; %bb.22: ; %Flow43
+; CHECK-NEXT:    s_branch .LBB0_19
+; CHECK-NEXT:  .LBB0_22: ; %Flow43
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_inst_prefetch 0x2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s52
@@ -369,7 +374,29 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_33
 ; CHECK-NEXT:  ; %bb.26:
 ; CHECK-NEXT:    s_mov_b32 s44, 0
-; CHECK-NEXT:  .LBB0_27: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_branch .LBB0_28
+; CHECK-NEXT:  .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s45
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s43
+; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_add_co_u32 v41, vcc_lo, v0, v41
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v41
+; CHECK-NEXT:    s_or_b32 s44, vcc_lo, s44
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT:    s_cbranch_execz .LBB0_33
+; CHECK-NEXT:  .LBB0_28: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v41
 ; CHECK-NEXT:    s_mov_b32 s45, exec_lo
 ; CHECK-NEXT:    ds_read_b32 v0, v0
@@ -397,8 +424,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_or_b32_e32 v5, v46, v57
 ; CHECK-NEXT:    v_or_b32_e32 v4, v45, v56
 ; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_32
-; CHECK-NEXT:  ; %bb.28: ; in Loop: Header=BB0_27 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_27
+; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx2 v[58:59], v[2:3], off offset:24
 ; CHECK-NEXT:    global_load_dwordx2 v[60:61], v[0:1], off offset:24
@@ -438,8 +465,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_mov_b32 s4, exec_lo
 ; CHECK-NEXT:    v_cmpx_gt_u32_e32 12, v0
 ; CHECK-NEXT:    s_xor_b32 s4, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_30
-; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_27 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_31
+; CHECK-NEXT:  ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    v_xor_b32_e32 v4, v60, v58
 ; CHECK-NEXT:    v_lshrrev_b64 v[2:3], 16, v[56:57]
 ; CHECK-NEXT:    v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51]
@@ -462,11 +489,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    global_store_dword v[6:7], v8, off offset:4
 ; CHECK-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off offset:8
 ; CHECK-NEXT:    global_store_dwordx2 v[6:7], v[4:5], off offset:24
-; CHECK-NEXT:  .LBB0_30: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_27 Depth=1
+; CHECK-NEXT:  .LBB0_31: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s4, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB0_32
-; CHECK-NEXT:  ; %bb.31: ; in Loop: Header=BB0_27 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_27
+; CHECK-NEXT:  ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v42
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v43
@@ -483,27 +510,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:  .LBB0_32: ; in Loop: Header=BB0_27 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s45
-; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s8, s34, 40
-; CHECK-NEXT:    s_addc_u32 s9, s35, 0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
-; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_add_co_u32 v41, vcc_lo, v0, v41
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v41
-; CHECK-NEXT:    s_or_b32 s44, vcc_lo, s44
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_27
+; CHECK-NEXT:    s_branch .LBB0_27
 ; CHECK-NEXT:  .LBB0_33:
 ; CHECK-NEXT:    s_endpgm
   %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
@@ -868,7 +875,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:  .LBB1_1: ; %.37
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB1_3 Depth 2
-; CHECK-NEXT:    ; Child Loop BB1_7 Depth 2
+; CHECK-NEXT:    ; Child Loop BB1_8 Depth 2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s4, v44
 ; CHECK-NEXT:    s_lshl_b32 s5, s4, 5
 ; CHECK-NEXT:    s_add_i32 s45, s4, 1
@@ -912,7 +919,18 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_mov_b32 s47, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:  .LBB1_7: ; %.103
+; CHECK-NEXT:    s_branch .LBB1_8
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB1_7: ; %.114
+; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT:    s_or_b32 s47, vcc_lo, s47
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s47
+; CHECK-NEXT:    s_cbranch_execz .LBB1_10
+; CHECK-NEXT:  .LBB1_8: ; %.103
 ; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v56
@@ -920,9 +938,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_and_saveexec_b32 s48, s4
-; CHECK-NEXT:    s_cbranch_execz .LBB1_9
-; CHECK-NEXT:  ; %bb.8: ; %.110
-; CHECK-NEXT:    ; in Loop: Header=BB1_7 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB1_7
+; CHECK-NEXT:  ; %bb.9: ; %.110
+; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; CHECK-NEXT:    s_add_u32 s8, s38, 40
@@ -940,16 +958,8 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v47
-; CHECK-NEXT:  .LBB1_9: ; %.114
-; CHECK-NEXT:    ; in Loop: Header=BB1_7 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
-; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
-; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
-; CHECK-NEXT:    s_or_b32 s47, vcc_lo, s47
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s47
-; CHECK-NEXT:    s_cbranch_execnz .LBB1_7
-; CHECK-NEXT:  ; %bb.10: ; %Flow
+; CHECK-NEXT:    s_branch .LBB1_7
+; CHECK-NEXT:  .LBB1_10: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_inst_prefetch 0x2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s47

>From 086ac46bce78a849eae39275c7601ad645783024 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 3 Jan 2025 04:03:48 -0500
Subject: [PATCH 4/4] Make comments refer to lanes instead of threads where
 lanes are meant.

---
 .../Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
index 44546cfbdae189..bce83493970dd1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
@@ -7,14 +7,14 @@
 //===----------------------------------------------------------------------===//
 // Estimate if conditional branches for which SIAnnotateControlFlow introduced
 // amdgcn_if or amdgcn_else intrinsics are likely to have different outcomes for
-// the threads of each wavefront. If that is the case, BranchWeight metadata is
+// the lanes of each wavefront. If that is the case, BranchWeight metadata is
 // added to signal that "then" and "else" blocks are both likely to be executed.
 // This may introduce branch weights that would be self-contradictory in a
 // non-SIMT setting.
 //
 // A consequence of this is that SIPreEmitPeephole is more likely to eliminate
 // s_cbranch_execz instructions that were introduced to skip these blocks when
-// no thread in the wavefront is active for them.
+// no lane in the wavefront is active for them.
 //
 // Should only run after SIAnnotateControlFlow.
 //===----------------------------------------------------------------------===//
@@ -218,7 +218,7 @@ bool AMDGPUAnnotateVaryingBranchWeightsImpl::run(Function &F) {
 
   // reqd_work_group_size determines the size of the work group in every
   // dimension. If it is present, identify the dimensions where the workitem id
-  // differs between the threads of the same wavefront. Otherwise assume that
+  // differs between the lanes of the same wavefront. Otherwise assume that
   // only dimension 0, i.e., x, varies.
   //
   // TODO can/should we assume that workitems are grouped into waves like that?


