[llvm] [AMDGPU] Eliminate likely-spurious execz checks (PR #117567)
Fabian Ritter via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 25 07:06:26 PST 2024
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/117567
Currently, we introduce branches to skip conditionally executed instructions when the EXEC mask is zero, and we only eliminate them if the scheduling model says that executing the skipped instructions is cheaper than taking the branch instruction.
This patch adds branch weights to branches whose condition is likely (according to a heuristic) to vary dynamically across the threads of a wavefront. This causes SIPreEmitPeephole to eliminate the corresponding s_cbranch_execz instructions.
For now, this is implemented as a new middle-end pass with a rather optimistic heuristic, in order to gather initial feedback.
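To illustrate the intended effect, here is a hand-written sketch (not taken from the patch's tests; the exact block structure and the weight values are only illustrative) of the IR shape the pass looks for after SIAnnotateControlFlow, together with the !prof metadata it would attach. The actual weights are derived from TTI.getPredictableBranchThreshold():

  ; Sketch: a divergent "if" as structured by SIAnnotateControlFlow (wave64).
  declare i32 @llvm.amdgcn.workitem.id.x()
  declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
  declare void @llvm.amdgcn.end.cf.i64(i64)

  define amdgpu_kernel void @sketch(ptr addrspace(1) %p) {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()  ; source of divergence
    %cond = icmp ult i32 %tid, 15                 ; likely varies within a wavefront
    %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
    %guard = extractvalue { i1, i64 } %if, 0
    %mask = extractvalue { i1, i64 } %if, 1
    ; The pass attaches the !prof weights below, marking %then as likely, so
    ; SIPreEmitPeephole can drop the s_cbranch_execz that would guard it.
    br i1 %guard, label %then, label %endif, !prof !0

  then:
    store i32 0, ptr addrspace(1) %p
    br label %endif

  endif:
    call void @llvm.amdgcn.end.cf.i64(i64 %mask)
    ret void
  }

  !0 = !{!"branch_weights", i32 1000, i32 1}      ; illustrative values only

The pass is registered as amdgpu-annotate-varying-branch-weights, so it can also be exercised in isolation with opt -passes=amdgpu-annotate-varying-branch-weights on GCN IR.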
From 3870adfa8ff6fc31e1efcfbfb120740ca17461ee Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 25 Nov 2024 09:44:23 -0500
Subject: [PATCH] [AMDGPU] Eliminate likely-spurious execz checks
Currently, we introduce branches to skip conditionally executed
instructions if the EXEC mask is zero and only eliminate them if the
scheduling model says that executing the skipped instructions is cheaper
than taking the branch instruction.
This patch adds branch weights to branches where the threads of a
wavefront are likely (according to a heuristic) to have dynamically
varying values for the branch condition. This causes SIPreEmitPeephole
to eliminate the corresponding execz branch.
Currently, this is implemented as a new middle end pass with a rather
optimistic heuristic, to gather initial feedback.
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 15 ++
.../AMDGPUAnnotateVaryingBranchWeights.cpp | 201 ++++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 3 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../conditional-mem-no-cbranch-execz.ll | 66 ++++++
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 +
...ne-sink-temporal-divergence-swdev407790.ll | 176 ++++++++-------
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 +
9 files changed, 380 insertions(+), 92 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..e7914d1de4a8fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -380,6 +380,10 @@ FunctionPass *createAMDGPURewriteUndefForPHILegacyPass();
void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &);
extern char &AMDGPURewriteUndefForPHILegacyPassID;
+FunctionPass *createAMDGPUAnnotateVaryingBranchWeightsLegacyPass();
+void initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(PassRegistry &);
+extern char &AMDGPUAnnotateVaryingBranchWeightsLegacyPassID;
+
class AMDGPURewriteUndefForPHIPass
: public PassInfoMixin<AMDGPURewriteUndefForPHIPass> {
public:
@@ -397,6 +401,17 @@ class SIAnnotateControlFlowPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+class AMDGPUAnnotateVaryingBranchWeightsPass
+ : public PassInfoMixin<AMDGPUAnnotateVaryingBranchWeightsPass> {
+private:
+ const AMDGPUTargetMachine &TM;
+
+public:
+ AMDGPUAnnotateVaryingBranchWeightsPass(const AMDGPUTargetMachine &TM)
+ : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
void initializeSIAnnotateControlFlowLegacyPass(PassRegistry &);
extern char &SIAnnotateControlFlowLegacyPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
new file mode 100644
index 00000000000000..3c637290cbdbe3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateVaryingBranchWeights.cpp
@@ -0,0 +1,201 @@
+//===- AMDGPUAnnotateVaryingBranchWeights.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Estimate if conditional branches for which SIAnnotateControlFlow introduced
+// amdgcn_if or amdgcn_else intrinsics are likely to have different outcomes for
+// the threads of each wavefront. If that is the case, BranchWeight metadata is
+// added to signal that "then" and "else" blocks are both likely to be executed.
+// This may introduce branch weights that would be self-contradictory in a
+// non-SIMT setting.
+//
+// A consequence of this is that SIPreEmitPeephole is more likely to eliminate
+// s_cbranch_execz instructions that were introduced to skip these blocks when
+// no thread in the wavefront is active for them.
+//
+// Should only run after SIAnnotateControlFlow.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-annotate-varying-branch-weights"
+
+namespace {
+
+class AMDGPUAnnotateVaryingBranchWeightsImpl {
+public:
+ AMDGPUAnnotateVaryingBranchWeightsImpl() = delete;
+ AMDGPUAnnotateVaryingBranchWeightsImpl(const GCNSubtarget &ST,
+ const TargetTransformInfo &TTI)
+ : ST(ST), TTI(TTI) {
+ // Determine weights that signal that a branch is very likely to be
+ // predicted correctly, i.e., whose ratio exceeds
+ // TTI.getPredictableBranchThreshold().
+ auto BranchProbThreshold = TTI.getPredictableBranchThreshold();
+ LikelyWeight = BranchProbThreshold.getNumerator();
+ UnlikelyWeight = BranchProbThreshold.getDenominator() - LikelyWeight;
+ if (UnlikelyWeight > 0)
+ --UnlikelyWeight;
+ }
+
+ bool run(Function &F);
+
+private:
+ const GCNSubtarget &ST;
+ const TargetTransformInfo &TTI;
+ uint32_t LikelyWeight;
+ uint32_t UnlikelyWeight;
+ ValueMap<const Value *, bool> LikelyVaryingCache;
+
+ /// Heuristically check if it is likely that a wavefront has dynamically
+ /// varying values for V.
+ bool isLikelyVarying(const Value *V);
+
+ /// Set branch weights that signal that the "true" successor of Term is the
+ /// likely destination, if no prior weights are present.
+ /// Return true if weights were set.
+ bool setTrueSuccessorLikely(BranchInst *Term);
+};
+
+class AMDGPUAnnotateVaryingBranchWeightsLegacy : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUAnnotateVaryingBranchWeightsLegacy() : FunctionPass(ID) {
+ initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "AMDGPU Annotate Varying Branch Weights";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+
+ bool runOnFunction(Function &F) override {
+ TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
+ return AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+ }
+};
+
+} // end anonymous namespace
+
+char AMDGPUAnnotateVaryingBranchWeightsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
+ "Annotate Varying Branch Weights", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
+ "Annotate Varying Branch Weights", false, false)
+
+FunctionPass *llvm::createAMDGPUAnnotateVaryingBranchWeightsLegacyPass() {
+ return new AMDGPUAnnotateVaryingBranchWeightsLegacy();
+}
+
+PreservedAnalyses
+AMDGPUAnnotateVaryingBranchWeightsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
+ bool Changed = AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
+ // Check if V is a source of divergence or if it transitively uses one.
+ if (TTI.isSourceOfDivergence(V))
+ return true;
+
+ auto *U = dyn_cast<User>(V);
+ if (!U)
+ return false;
+
+  // Have we already checked V? Seeding false here also breaks PHI cycles.
+  auto CacheEntry = LikelyVaryingCache.insert({V, false});
+  if (!CacheEntry.second)
+    return CacheEntry.first->second;
+
+  // Does it use a likely varying Value?
+  bool Result = false;
+ for (const auto &Use : U->operands()) {
+ Result |= isLikelyVarying(Use);
+ if (Result)
+ break;
+ }
+
+  LikelyVaryingCache[V] = Result;
+ return Result;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::setTrueSuccessorLikely(
+ BranchInst *Term) {
+ assert(Term->isConditional());
+
+ // Don't overwrite existing branch weights.
+ if (hasProfMD(*Term))
+ return false;
+
+ llvm::setBranchWeights(*Term, {LikelyWeight, UnlikelyWeight}, false);
+ LLVM_DEBUG(dbgs() << "Added branch weights: " << *Term << '\n');
+ return true;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::run(Function &F) {
+ // If the workgroup has only a single thread, the condition cannot vary.
+ const auto WGSizes = ST.getFlatWorkGroupSizes(F);
+ if (WGSizes.first <= 1)
+ return false;
+
+ using namespace PatternMatch;
+
+ bool Changed = false;
+ for (auto &BB : F) {
+ auto *Term = BB.getTerminator();
+ // Look for conditional branches whose condition is an ExtractValueInst
+ // that extracts the return value of a call to the amdgcn_if or amdgcn_else
+ // intrinsic.
+ if (match(Term, m_Br(m_ExtractValue<0>(m_CombineOr(
+ m_Intrinsic<Intrinsic::amdgcn_if>(),
+ m_Intrinsic<Intrinsic::amdgcn_else>())),
+ m_Value(), m_Value()))) {
+      // This condition is an artificial value resulting from the control
+ // flow intrinsic, not the actual branch condition. However, the
+ // intrinsics connect it via data flow with the actual condition
+ // (even for the amdgcn_else intrinsic, via the matching amdgcn_if
+ // intrinsic), so isLikelyVarying still produces meaningful results.
+ if (isLikelyVarying(cast<BranchInst>(Term)->getCondition()))
+ Changed |= setTrueSuccessorLikely(cast<BranchInst>(Term));
+ }
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..63a7b0a50c4455 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -65,6 +65,9 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-annotate-varying-branch-weights",
+ AMDGPUAnnotateVaryingBranchWeightsPass(
+ *static_cast<const GCNTargetMachine *>(this)))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index da18f2b20f1427..325db6ca9b7ace 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -501,6 +501,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUReserveWWMRegsPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
+ initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
@@ -1315,6 +1316,7 @@ bool GCNPassConfig::addPreISel() {
// analysis. This depends on stopping SIAnnotateControlFlow from making
// control flow modifications.
addPass(createAMDGPURewriteUndefForPHILegacyPass());
+ addPass(createAMDGPUAnnotateVaryingBranchWeightsLegacyPass());
addPass(createLCSSAPass());
@@ -2003,6 +2005,8 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
// control flow modifications.
addPass(AMDGPURewriteUndefForPHIPass());
+ addPass(AMDGPUAnnotateVaryingBranchWeightsPass(TM));
+
addPass(LCSSAPass());
if (TM.getOptLevel() > CodeGenOptLevel::Less)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..7e85770c70d5ff 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
+ AMDGPUAnnotateVaryingBranchWeights.cpp
AMDGPUExportClustering.cpp
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
new file mode 100644
index 00000000000000..f5f4c0a12eaeee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+sramecc,-xnack < %s | FileCheck %s
+
+; Check that simple conditional memory accesses that are guarded by likely
+; varying conditions are not lowered with an s_cbranch_execz to bypass them.
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+define amdgpu_kernel void @cond_ops(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: cond_ops:
+; CHECK: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; CHECK-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; CHECK-NEXT: ; %bb.0: ; %entry
+; CHECK-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 10, 10
+; CHECK-NEXT: v_lshl_or_b32 v5, v0, 6, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 4, v5
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 4, v5
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT: ; %bb.1: ; %do.load
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[8:9]
+; CHECK-NEXT: ; %bb.2: ; %post.load
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v5
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT: s_cbranch_execz .LBB0_4
+; CHECK-NEXT: ; %bb.3: ; %do.store
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
+; CHECK-NEXT: .LBB0_4: ; %exit
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+ %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+ %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+ %tid = or disjoint i32 %tid.x, %tid.y.shift
+ %k = lshr i32 %tid, 4
+ %j = and i32 %tid, 15
+ %load.cond = icmp ult i32 %k, 15
+ %tid.ext = zext nneg i32 %tid to i64
+ %my.x = getelementptr <4 x float>, ptr addrspace(1) %x, i64 %tid.ext
+ br i1 %load.cond, label %do.load, label %post.load
+do.load:
+ %loaded = load <4 x float>, ptr addrspace(1) %my.x
+ br label %post.load
+post.load:
+ %maybe.loaded = phi <4 x float> [ %loaded, %do.load ], [ zeroinitializer, %entry ]
+ %my.y = getelementptr <4 x float>, ptr addrspace(1) %y, i64 %tid.ext
+ %store.cond = icmp ult i32 %j, 15
+ br i1 %store.cond, label %do.store, label %exit
+do.store:
+ store <4 x float> %maybe.loaded, ptr addrspace(1) %my.y
+ br label %exit
+exit:
+ ret void
+}
+
+attributes #0 = {"uniform-work-group-size"="true" "amdgpu-flat-work-group-size"="256,256"}
+!0 = !{i32 64, i32 4, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index e77f4f69e265bb..65bb9a9652b17e 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -87,6 +87,7 @@
; GCN-O0-NEXT: Cycle Info Analysis
; GCN-O0-NEXT: Uniformity Analysis
; GCN-O0-NEXT: AMDGPU Rewrite Undef for PHI
+; GCN-O0-NEXT: AMDGPU Annotate Varying Branch Weights
; GCN-O0-NEXT: LCSSA Verifier
; GCN-O0-NEXT: Loop-Closed SSA Form Pass
; GCN-O0-NEXT: DummyCGSCCPass
@@ -279,6 +280,7 @@
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU Rewrite Undef for PHI
+; GCN-O1-NEXT: AMDGPU Annotate Varying Branch Weights
; GCN-O1-NEXT: LCSSA Verifier
; GCN-O1-NEXT: Loop-Closed SSA Form Pass
; GCN-O1-NEXT: DummyCGSCCPass
@@ -579,6 +581,7 @@
; GCN-O1-OPTS-NEXT: Cycle Info Analysis
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU Rewrite Undef for PHI
+; GCN-O1-OPTS-NEXT: AMDGPU Annotate Varying Branch Weights
; GCN-O1-OPTS-NEXT: LCSSA Verifier
; GCN-O1-OPTS-NEXT: Loop-Closed SSA Form Pass
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
@@ -891,6 +894,7 @@
; GCN-O2-NEXT: Cycle Info Analysis
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU Rewrite Undef for PHI
+; GCN-O2-NEXT: AMDGPU Annotate Varying Branch Weights
; GCN-O2-NEXT: LCSSA Verifier
; GCN-O2-NEXT: Loop-Closed SSA Form Pass
; GCN-O2-NEXT: Analysis if a function is memory bound
@@ -1218,6 +1222,7 @@
; GCN-O3-NEXT: Cycle Info Analysis
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU Rewrite Undef for PHI
+; GCN-O3-NEXT: AMDGPU Annotate Varying Branch Weights
; GCN-O3-NEXT: LCSSA Verifier
; GCN-O3-NEXT: Loop-Closed SSA Form Pass
; GCN-O3-NEXT: Analysis if a function is memory bound
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index c826980991f94f..5db6dd8d64d283 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -146,8 +146,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v47, 0
; CHECK-NEXT: s_mov_b32 s47, 0
; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
-; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
-; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
+; CHECK-NEXT: ; Child Loop BB0_7 Depth 2
+; CHECK-NEXT: ; Child Loop BB0_19 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s47, v44
; CHECK-NEXT: s_lshl_b32 s4, s47, 5
; CHECK-NEXT: s_add_i32 s46, s47, 1
@@ -163,20 +163,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s53, 0
; CHECK-NEXT: s_mov_b32 s56, 0
-; CHECK-NEXT: s_branch .LBB0_8
-; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT: s_add_i32 s56, s56, 4
-; CHECK-NEXT: s_add_i32 s4, s47, s56
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s56, v57
-; CHECK-NEXT: s_add_i32 s5, s4, 5
-; CHECK-NEXT: s_add_i32 s4, s4, 1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
-; CHECK-NEXT: v_mov_b32_e32 v58, s4
-; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
-; CHECK-NEXT: s_cbranch_execz .LBB0_16
-; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT: .LBB0_7: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v59, s56, v46
; CHECK-NEXT: v_add_nc_u32_e32 v58, s56, v57
@@ -184,8 +171,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s57, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_10
-; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_cbranch_execz .LBB0_9
+; CHECK-NEXT: ; %bb.8: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -203,14 +190,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
-; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s57, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_12
-; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_cbranch_execz .LBB0_11
+; CHECK-NEXT: ; %bb.10: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -229,14 +216,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
-; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s57, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_14
-; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_cbranch_execz .LBB0_13
+; CHECK-NEXT: ; %bb.12: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -255,14 +242,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
-; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s57, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_7
-; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_cbranch_execz .LBB0_15
+; CHECK-NEXT: ; %bb.14: ; in Loop: Header=BB0_7 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -281,8 +268,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
-; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_16: ; %Flow45
+; CHECK-NEXT: .LBB0_15: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT: s_add_i32 s56, s56, 4
+; CHECK-NEXT: s_add_i32 s4, s47, s56
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s56, v57
+; CHECK-NEXT: s_add_i32 s5, s4, 5
+; CHECK-NEXT: s_add_i32 s4, s4, 1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT: v_mov_b32_e32 v58, s4
+; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT: s_cbranch_execnz .LBB0_7
+; CHECK-NEXT: ; %bb.16: ; %Flow45
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: v_mov_b32_e32 v57, v0
@@ -296,25 +294,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s52, 0
; CHECK-NEXT: s_inst_prefetch 0x1
-; CHECK-NEXT: s_branch .LBB0_20
; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
-; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58
-; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42
-; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
-; CHECK-NEXT: s_cbranch_execz .LBB0_22
-; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT: .LBB0_19: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s53, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_19
-; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT: s_cbranch_execz .LBB0_21
+; CHECK-NEXT: ; %bb.20: ; in Loop: Header=BB0_19 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -332,8 +321,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
-; CHECK-NEXT: s_branch .LBB0_19
-; CHECK-NEXT: .LBB0_22: ; %Flow43
+; CHECK-NEXT: .LBB0_21: ; in Loop: Header=BB0_19 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58
+; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42
+; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
+; CHECK-NEXT: s_cbranch_execnz .LBB0_19
+; CHECK-NEXT: ; %bb.22: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
@@ -374,29 +370,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: ; %bb.26:
; CHECK-NEXT: s_mov_b32 s44, 0
-; CHECK-NEXT: s_branch .LBB0_28
-; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
-; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_add_u32 s8, s34, 40
-; CHECK-NEXT: s_addc_u32 s9, s35, 0
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT: s_mov_b32 s12, s43
-; CHECK-NEXT: s_mov_b32 s13, s42
-; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
-; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
-; CHECK-NEXT: s_cbranch_execz .LBB0_33
-; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: .LBB0_27: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
; CHECK-NEXT: s_mov_b32 s45, exec_lo
; CHECK-NEXT: ds_read_b32 v0, v0
@@ -424,8 +398,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_or_b32_e32 v5, v46, v57
; CHECK-NEXT: v_or_b32_e32 v4, v45, v56
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
-; CHECK-NEXT: s_cbranch_execz .LBB0_27
-; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: s_cbranch_execz .LBB0_32
+; CHECK-NEXT: ; %bb.28: ; in Loop: Header=BB0_27 Depth=1
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
@@ -465,8 +439,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_31
-; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: s_cbranch_execz .LBB0_30
+; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_27 Depth=1
; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58
; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57]
; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51]
@@ -489,11 +463,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4
; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8
; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24
-; CHECK-NEXT: .LBB0_31: ; %Flow
-; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: .LBB0_30: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB0_27 Depth=1
; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
-; CHECK-NEXT: s_cbranch_execz .LBB0_27
-; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: s_cbranch_execz .LBB0_32
+; CHECK-NEXT: ; %bb.31: ; in Loop: Header=BB0_27 Depth=1
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, v42
; CHECK-NEXT: v_mov_b32_e32 v1, v43
@@ -510,7 +484,27 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: s_branch .LBB0_27
+; CHECK-NEXT: .LBB0_32: ; in Loop: Header=BB0_27 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
+; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s43
+; CHECK-NEXT: s_mov_b32 s13, s42
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
+; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT: s_cbranch_execnz .LBB0_27
; CHECK-NEXT: .LBB0_33:
; CHECK-NEXT: s_endpgm
%6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
@@ -875,7 +869,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: .LBB1_1: ; %.37
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
-; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
+; CHECK-NEXT: ; Child Loop BB1_7 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s45, s4, 1
@@ -919,18 +913,8 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s47, 0
; CHECK-NEXT: s_inst_prefetch 0x1
-; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB1_7: ; %.114
-; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
-; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
-; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
-; CHECK-NEXT: s_or_b32 s47, vcc_lo, s47
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s47
-; CHECK-NEXT: s_cbranch_execz .LBB1_10
-; CHECK-NEXT: .LBB1_8: ; %.103
+; CHECK-NEXT: .LBB1_7: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
@@ -938,9 +922,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s48, s4
-; CHECK-NEXT: s_cbranch_execz .LBB1_7
-; CHECK-NEXT: ; %bb.9: ; %.110
-; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT: s_cbranch_execz .LBB1_9
+; CHECK-NEXT: ; %bb.8: ; %.110
+; CHECK-NEXT: ; in Loop: Header=BB1_7 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s38, 40
@@ -958,8 +942,16 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v47
-; CHECK-NEXT: s_branch .LBB1_7
-; CHECK-NEXT: .LBB1_10: ; %Flow
+; CHECK-NEXT: .LBB1_9: ; %.114
+; CHECK-NEXT: ; in Loop: Header=BB1_7 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT: s_or_b32 s47, vcc_lo, s47
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s47
+; CHECK-NEXT: s_cbranch_execnz .LBB1_7
+; CHECK-NEXT: ; %bb.10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index d122e8a21671be..2970b820dcd7d1 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -142,6 +142,7 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPUCodeGenPrepare.cpp",
"AMDGPUCombinerHelper.cpp",
"AMDGPUCtorDtorLowering.cpp",
+    "AMDGPUAnnotateVaryingBranchWeights.cpp",
"AMDGPUExportClustering.cpp",
"AMDGPUFrameLowering.cpp",
"AMDGPUGlobalISelDivergenceLowering.cpp",