[llvm] [CodeGen] Add OffloadBlockUniformityAnalysis for offload PGO (PR #178417)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 28 07:03:42 PST 2026


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/178417

>From ffe44a5be52641dda5ede8a25ba5f5060435ddf6 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Tue, 27 Jan 2026 16:42:05 -0500
Subject: [PATCH 1/3] [CodeGen] Add OffloadBlockUniformityAnalysis for offload
 PGO

Add a target-agnostic MachineFunction analysis that propagates per-block
uniformity information from IR metadata to codegen passes.

The analysis reads "offload-block-uniformity" metadata attached to IR
BasicBlock terminators during PGO-use. This metadata is produced by the
offload PGO infrastructure when profile data includes uniformity bits.

SpillPlacement consumes this analysis to flatten block frequencies for
divergent blocks: each divergent block is given the entry block
frequency instead of its profiled frequency. This prevents PGO-guided
spill placement from causing performance regressions on SIMT
architectures where "cold" divergent paths still execute with partial
wave occupancy.

Key components:
- OffloadBlockUniformityInfo: Stores per-MBB divergence classification
- OffloadBlockUniformityAnalysis: MachineFunctionAnalysis wrapper
- SpillPlacement integration: Queries the analysis (new pass manager) or
  computes the info directly (legacy pass manager) to find divergent blocks

This is independent of the core offload PGO infrastructure - if no
metadata exists, the analysis reports hasUniformity()=false and
SpillPlacement behaves normally.
---
 .../llvm/CodeGen/OffloadBlockUniformity.h     | 65 ++++++++++++++
 llvm/include/llvm/CodeGen/SpillPlacement.h    |  4 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  1 +
 .../llvm/Passes/MachinePassRegistry.def       |  2 +
 llvm/lib/CodeGen/CMakeLists.txt               |  1 +
 llvm/lib/CodeGen/OffloadBlockUniformity.cpp   | 84 +++++++++++++++++++
 llvm/lib/CodeGen/SpillPlacement.cpp           | 24 ++++--
 llvm/lib/Passes/PassBuilder.cpp               |  1 +
 8 files changed, 176 insertions(+), 6 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
 create mode 100644 llvm/lib/CodeGen/OffloadBlockUniformity.cpp

diff --git a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
new file mode 100644
index 0000000000000..31e57758a23a0
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
@@ -0,0 +1,65 @@
+//===- OffloadBlockUniformity.h - Offload block uniformity info -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provide per-(Machine)basic-block uniformity information for offload profiles.
+//
+// The source of truth is IR metadata attached during PGO use:
+//   - Metadata name: "offload-block-uniformity"
+//   - Payload: i1 (true = uniform, false = divergent)
+//
+// This is intentionally target-agnostic: any offload backend that produces
+// uniformity bits in the profile can attach the same metadata and reuse this
+// analysis in codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
+#define LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineFunction;
+
+class OffloadBlockUniformityInfo {
+public:
+  static constexpr StringLiteral MetadataName = "offload-block-uniformity";
+
+  LLVM_ABI void compute(const MachineFunction &MF);
+
+  bool hasUniformity() const { return HasAnyUniformity; }
+
+  // Returns true if the block is considered divergent. If uniformity exists for
+  // the function but a block has no explicit annotation, it is treated as
+  // divergent (conservative).
+  LLVM_ABI bool isDivergent(const MachineBasicBlock &MBB) const;
+
+private:
+  bool HasAnyUniformity = false;
+  BitVector DivergentBlocks;
+};
+
+class OffloadBlockUniformityAnalysis
+    : public AnalysisInfoMixin<OffloadBlockUniformityAnalysis> {
+  friend AnalysisInfoMixin<OffloadBlockUniformityAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  using Result = OffloadBlockUniformityInfo;
+  LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
diff --git a/llvm/include/llvm/CodeGen/SpillPlacement.h b/llvm/include/llvm/CodeGen/SpillPlacement.h
index 1ef37f2718a65..490ebbb236efc 100644
--- a/llvm/include/llvm/CodeGen/SpillPlacement.h
+++ b/llvm/include/llvm/CodeGen/SpillPlacement.h
@@ -39,6 +39,7 @@ class BitVector;
 class EdgeBundles;
 class MachineBlockFrequencyInfo;
 class MachineFunction;
+class OffloadBlockUniformityInfo;
 class SpillPlacementWrapperLegacy;
 class SpillPlacementAnalysis;
 
@@ -169,7 +170,8 @@ class SpillPlacement {
   void releaseMemory();
 
   void run(MachineFunction &MF, EdgeBundles *Bundles,
-           MachineBlockFrequencyInfo *MBFI);
+           MachineBlockFrequencyInfo *MBFI,
+           const OffloadBlockUniformityInfo *Uniformity = nullptr);
   void activate(unsigned n);
   void setThreshold(BlockFrequency Entry);
 
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 6942fc42ca721..7930a58fcc290 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -60,6 +60,7 @@
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/MachineSink.h"
 #include "llvm/CodeGen/MachineVerifier.h"
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
 #include "llvm/CodeGen/OptimizePHIs.h"
 #include "llvm/CodeGen/PEI.h"
 #include "llvm/CodeGen/PHIElimination.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 20b066a2ead6d..ed56bd6acd3fc 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -81,6 +81,8 @@ MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree",
                           MachinePostDominatorTreeAnalysis())
 MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis())
 MACHINE_FUNCTION_ANALYSIS("machine-uniformity", MachineUniformityAnalysis())
+MACHINE_FUNCTION_ANALYSIS("offload-block-uniformity",
+                          OffloadBlockUniformityAnalysis())
 MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
 MACHINE_FUNCTION_ANALYSIS("reaching-def", ReachingDefAnalysis())
 MACHINE_FUNCTION_ANALYSIS("regalloc-evict", RegAllocEvictionAdvisorAnalysis())
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index f26b2cb6fddf5..b358150569cb3 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -68,6 +68,7 @@ add_llvm_component_library(LLVMCodeGen
   FixupStatepointCallerSaved.cpp
   FuncletLayout.cpp
   MachineFunctionAnalysis.cpp
+  OffloadBlockUniformity.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp
   GCRootLowering.cpp
diff --git a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
new file mode 100644
index 0000000000000..13d772c08d92c
--- /dev/null
+++ b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
@@ -0,0 +1,84 @@
+//===- OffloadBlockUniformity.cpp - Offload block uniformity info --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Metadata.h"
+#include <optional>
+
+using namespace llvm;
+
+static std::optional<bool> getIRBlockUniformity(const BasicBlock &BB) {
+  const Instruction *TI = BB.getTerminator();
+  if (!TI)
+    return std::nullopt;
+
+  MDNode *MD = TI->getMetadata(OffloadBlockUniformityInfo::MetadataName);
+  if (!MD)
+    return std::nullopt;
+
+  // Metadata format: !{i1 IsUniform} - structural validity assumed (verifier).
+  return mdconst::extract<ConstantInt>(MD->getOperand(0))->isOne();
+}
+
+void OffloadBlockUniformityInfo::compute(const MachineFunction &MF) {
+  HasAnyUniformity = false;
+  DivergentBlocks.clear();
+  DivergentBlocks.resize(MF.getNumBlockIDs());
+
+  // First determine whether any uniformity annotation exists for this function.
+  for (const MachineBasicBlock &MBB : MF) {
+    const BasicBlock *BB = MBB.getBasicBlock();
+    if (!BB)
+      continue;
+    if (getIRBlockUniformity(*BB).has_value()) {
+      HasAnyUniformity = true;
+      break;
+    }
+  }
+
+  if (!HasAnyUniformity)
+    return;
+
+  // Conservative behavior: if uniformity exists for the function but we cannot
+  // classify a particular (Machine)basic block, treat it as divergent.
+  for (const MachineBasicBlock &MBB : MF) {
+    const unsigned Num = MBB.getNumber();
+    bool IsDivergent = true;
+    if (const BasicBlock *BB = MBB.getBasicBlock()) {
+      if (auto U = getIRBlockUniformity(*BB))
+        IsDivergent = !*U;
+    }
+    if (Num < DivergentBlocks.size())
+      DivergentBlocks.set(Num, IsDivergent);
+  }
+}
+
+bool OffloadBlockUniformityInfo::isDivergent(
+    const MachineBasicBlock &MBB) const {
+  if (!HasAnyUniformity)
+    return false;
+  const unsigned Num = MBB.getNumber();
+  if (Num >= DivergentBlocks.size())
+    return true;
+  return DivergentBlocks.test(Num);
+}
+
+AnalysisKey OffloadBlockUniformityAnalysis::Key;
+
+OffloadBlockUniformityAnalysis::Result
+OffloadBlockUniformityAnalysis::run(MachineFunction &MF,
+                                    MachineFunctionAnalysisManager &) {
+  OffloadBlockUniformityInfo Info;
+  Info.compute(MF);
+  return Info;
+}
diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp b/llvm/lib/CodeGen/SpillPlacement.cpp
index 55a96a22a00ec..fd7e9a1d9919c 100644
--- a/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -32,7 +32,10 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include <algorithm>
@@ -193,7 +196,9 @@ bool SpillPlacementWrapperLegacy::runOnMachineFunction(MachineFunction &MF) {
   auto *Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
   auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
 
-  Impl.run(MF, Bundles, MBFI);
+  OffloadBlockUniformityInfo Uniformity;
+  Uniformity.compute(MF);
+  Impl.run(MF, Bundles, MBFI, &Uniformity);
   return false;
 }
 
@@ -204,8 +209,9 @@ SpillPlacementAnalysis::run(MachineFunction &MF,
                             MachineFunctionAnalysisManager &MFAM) {
   auto *Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF);
   auto *MBFI = &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+  auto &Uniformity = MFAM.getResult<OffloadBlockUniformityAnalysis>(MF);
   SpillPlacement Impl;
-  Impl.run(MF, Bundles, MBFI);
+  Impl.run(MF, Bundles, MBFI, &Uniformity);
   return Impl;
 }
 
@@ -217,7 +223,8 @@ bool SpillPlacementAnalysis::Result::invalidate(
     return true;
   // Check dependencies.
   return Inv.invalidate<EdgeBundlesAnalysis>(MF, PA) ||
-         Inv.invalidate<MachineBlockFrequencyAnalysis>(MF, PA);
+         Inv.invalidate<MachineBlockFrequencyAnalysis>(MF, PA) ||
+         Inv.invalidate<OffloadBlockUniformityAnalysis>(MF, PA);
 }
 
 SpillPlacement::SpillPlacement() = default;
@@ -230,7 +237,8 @@ void SpillPlacement::releaseMemory() {
 }
 
 void SpillPlacement::run(MachineFunction &mf, EdgeBundles *Bundles,
-                         MachineBlockFrequencyInfo *MBFI) {
+                         MachineBlockFrequencyInfo *MBFI,
+                         const OffloadBlockUniformityInfo *Uniformity) {
   MF = &mf;
   this->bundles = Bundles;
   this->MBFI = MBFI;
@@ -240,12 +248,18 @@ void SpillPlacement::run(MachineFunction &mf, EdgeBundles *Bundles,
   TodoList.clear();
   TodoList.setUniverse(bundles->getNumBundles());
 
+  const bool HasUniformity = Uniformity && Uniformity->hasUniformity();
+
   // Compute total ingoing and outgoing block frequencies for all bundles.
   BlockFrequencies.resize(mf.getNumBlockIDs());
   setThreshold(MBFI->getEntryFreq());
   for (auto &I : mf) {
     unsigned Num = I.getNumber();
-    BlockFrequencies[Num] = MBFI->getBlockFreq(&I);
+    if (HasUniformity && Uniformity->isDivergent(I)) {
+      BlockFrequencies[Num] = MBFI->getEntryFreq();
+    } else {
+      BlockFrequencies[Num] = MBFI->getBlockFreq(&I);
+    }
   }
 }
 
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8bb78c8c7df63..d541062b743d2 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -135,6 +135,7 @@
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/CodeGen/MachineVerifier.h"
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
 #include "llvm/CodeGen/OptimizePHIs.h"
 #include "llvm/CodeGen/PEI.h"
 #include "llvm/CodeGen/PHIElimination.h"

>From 05e1805c3ce635148a3b7fe94d71044fc3cf4960 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 28 Jan 2026 08:52:48 -0500
Subject: [PATCH 2/3] [CodeGen] Fix BitVector::set bug and add test for
 OffloadBlockUniformity

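Fix incorrect BitVector::set() usage: set(I, E) sets the range of bits
[I, E), not a single bit with a value. The bool IsDivergent was
implicitly converted to the range end E, so when IsDivergent=false
(E = 0) any non-zero block number caused the "Attempted to set backwards
range!" assertion failure.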

Also add a codegen test for the OffloadBlockUniformityAnalysis that
verifies the analysis correctly reads offload-block-uniformity metadata.
---
 .../llvm/CodeGen/OffloadBlockUniformity.h     |  3 +-
 llvm/lib/CodeGen/OffloadBlockUniformity.cpp   |  4 +-
 .../AMDGPU/offload-block-uniformity.ll        | 98 +++++++++++++++++++
 3 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll

diff --git a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
index 31e57758a23a0..66821c1f61167 100644
--- a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
+++ b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
@@ -57,7 +57,8 @@ class OffloadBlockUniformityAnalysis
 
 public:
   using Result = OffloadBlockUniformityInfo;
-  LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+  LLVM_ABI Result run(MachineFunction &MF,
+                      MachineFunctionAnalysisManager &MFAM);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
index 13d772c08d92c..235e40a6e3982 100644
--- a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
+++ b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
@@ -58,8 +58,8 @@ void OffloadBlockUniformityInfo::compute(const MachineFunction &MF) {
       if (auto U = getIRBlockUniformity(*BB))
         IsDivergent = !*U;
     }
-    if (Num < DivergentBlocks.size())
-      DivergentBlocks.set(Num, IsDivergent);
+    if (Num < DivergentBlocks.size() && IsDivergent)
+      DivergentBlocks.set(Num);
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
new file mode 100644
index 0000000000000..60a1864020251
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=greedy < %s | FileCheck %s
+
+; Test that OffloadBlockUniformityAnalysis correctly reads offload-block-uniformity
+; metadata from IR basic blocks and propagates uniformity information to codegen.
+;
+; This metadata is attached during PGO-use phase to indicate whether a basic block
+; was executed uniformly (all lanes together) or divergently (partial wave).
+;
+; The analysis is consumed by SpillPlacement to flatten block frequencies for
+; divergent blocks, preventing PGO from causing regressions on divergent code paths.
+
+; CHECK-LABEL: name: uniform_blocks
+; CHECK: body:
+define amdgpu_kernel void @uniform_blocks(ptr addrspace(1) %out, i32 %cond) #0 {
+entry:
+  %cmp = icmp sgt i32 %cond, 0
+  ; Entry block is uniform - all threads enter together
+  br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
+
+if.then:
+  ; This block is uniform - all threads that enter, enter together
+  store i32 1, ptr addrspace(1) %out, align 4
+  br label %if.end, !offload-block-uniformity !0
+
+if.end:
+  ; Exit block is uniform
+  ret void, !offload-block-uniformity !0
+}
+
+; CHECK-LABEL: name: divergent_blocks
+; CHECK: body:
+define amdgpu_kernel void @divergent_blocks(ptr addrspace(1) %out, i32 %tid) #0 {
+entry:
+  %cmp = icmp eq i32 %tid, 0
+  ; Entry is uniform, but branch target is divergent
+  br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
+
+if.then:
+  ; This block is divergent - only some threads enter
+  store i32 1, ptr addrspace(1) %out, align 4
+  br label %if.end, !offload-block-uniformity !1
+
+if.end:
+  ; Exit block is uniform (reconverged)
+  ret void, !offload-block-uniformity !0
+}
+
+; CHECK-LABEL: name: mixed_uniformity
+; CHECK: body:
+define amdgpu_kernel void @mixed_uniformity(ptr addrspace(1) %out, i32 %cond, i32 %tid) #0 {
+entry:
+  %cmp1 = icmp sgt i32 %cond, 0
+  br i1 %cmp1, label %outer.then, label %exit, !offload-block-uniformity !0
+
+outer.then:
+  ; Uniform outer branch
+  %cmp2 = icmp eq i32 %tid, 0
+  br i1 %cmp2, label %inner.then, label %inner.else, !offload-block-uniformity !0
+
+inner.then:
+  ; Divergent inner branch - only lane 0
+  store i32 1, ptr addrspace(1) %out, align 4
+  br label %merge, !offload-block-uniformity !1
+
+inner.else:
+  ; Divergent - other lanes
+  store i32 2, ptr addrspace(1) %out, align 4
+  br label %merge, !offload-block-uniformity !1
+
+merge:
+  ; Reconverged - uniform again
+  br label %exit, !offload-block-uniformity !0
+
+exit:
+  ret void, !offload-block-uniformity !0
+}
+
+; CHECK-LABEL: name: no_uniformity_metadata
+; CHECK: body:
+define amdgpu_kernel void @no_uniformity_metadata(ptr addrspace(1) %out, i32 %cond) #0 {
+entry:
+  ; No uniformity metadata - analysis should report hasUniformity() = false
+  %cmp = icmp sgt i32 %cond, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 1, ptr addrspace(1) %out, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+
+; Metadata: i1 true = uniform, i1 false = divergent
+!0 = !{i1 true}   ; uniform
+!1 = !{i1 false}  ; divergent

>From 80ce6f6a53b38ee50d7c69df8f119cf3f4e9ac18 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 28 Jan 2026 10:03:22 -0500
Subject: [PATCH 3/3] [CodeGen] Add offload uniformity printer and stronger
 test

Expose offload block uniformity classification via a printer pass so
codegen tests can assert uniform, divergent, and missing-metadata cases.
Update the AMDGPU test to validate the printed classifications.
---
 .../llvm/CodeGen/OffloadBlockUniformity.h     | 15 ++++
 .../llvm/Passes/MachinePassRegistry.def       |  2 +
 llvm/lib/CodeGen/OffloadBlockUniformity.cpp   | 33 ++++++++
 .../AMDGPU/offload-block-uniformity.ll        | 81 +++++++------------
 4 files changed, 81 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
index 66821c1f61167..148c38443c35d 100644
--- a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
+++ b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
@@ -25,12 +25,14 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
 
 class MachineBasicBlock;
 class MachineFunction;
+class raw_ostream;
 
 class OffloadBlockUniformityInfo {
 public:
@@ -45,6 +47,8 @@ class OffloadBlockUniformityInfo {
   // divergent (conservative).
   LLVM_ABI bool isDivergent(const MachineBasicBlock &MBB) const;
 
+  LLVM_ABI void print(raw_ostream &OS, const MachineFunction &MF) const;
+
 private:
   bool HasAnyUniformity = false;
   BitVector DivergentBlocks;
@@ -61,6 +65,17 @@ class OffloadBlockUniformityAnalysis
                       MachineFunctionAnalysisManager &MFAM);
 };
 
+class OffloadBlockUniformityPrinterPass
+    : public PassInfoMixin<OffloadBlockUniformityPrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit OffloadBlockUniformityPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+  static bool isRequired() { return true; }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index ed56bd6acd3fc..48d859abfa3b2 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -154,6 +154,8 @@ MACHINE_FUNCTION_PASS("print<machine-post-dom-tree>",
                       MachinePostDominatorTreePrinterPass(errs()))
 MACHINE_FUNCTION_PASS("print<machine-uniformity>",
                       MachineUniformityPrinterPass(errs()))
+MACHINE_FUNCTION_PASS("print<offload-block-uniformity>",
+                      OffloadBlockUniformityPrinterPass(errs()))
 MACHINE_FUNCTION_PASS("print<reaching-def>", ReachingDefPrinterPass(errs()))
 MACHINE_FUNCTION_PASS("print<slot-indexes>", SlotIndexesPrinterPass(errs()))
 MACHINE_FUNCTION_PASS("print<virtregmap>", VirtRegMapPrinterPass(errs()))
diff --git a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
index 235e40a6e3982..868ff3d20dfe7 100644
--- a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
+++ b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
@@ -13,6 +13,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/Support/raw_ostream.h"
 #include <optional>
 
 using namespace llvm;
@@ -63,6 +64,30 @@ void OffloadBlockUniformityInfo::compute(const MachineFunction &MF) {
   }
 }
 
+void OffloadBlockUniformityInfo::print(raw_ostream &OS,
+                                       const MachineFunction &MF) const {
+  OS << "OffloadBlockUniformityInfo for function: ";
+  MF.getFunction().printAsOperand(OS, /*PrintType=*/false);
+  OS << '\n';
+  OS << "HasUniformity: " << (HasAnyUniformity ? "true" : "false") << '\n';
+  if (!HasAnyUniformity)
+    return;
+
+  for (const MachineBasicBlock &MBB : MF) {
+    const BasicBlock *BB = MBB.getBasicBlock();
+    if (!BB)
+      continue;
+    OS << "  BLOCK bb." << MBB.getNumber();
+    if (BB->hasName())
+      OS << " (%" << BB->getName() << ")";
+    if (auto U = getIRBlockUniformity(*BB)) {
+      OS << ": " << (*U ? "uniform" : "divergent") << '\n';
+      continue;
+    }
+    OS << ": no-metadata (treated divergent)\n";
+  }
+}
+
 bool OffloadBlockUniformityInfo::isDivergent(
     const MachineBasicBlock &MBB) const {
   if (!HasAnyUniformity)
@@ -82,3 +107,11 @@ OffloadBlockUniformityAnalysis::run(MachineFunction &MF,
   Info.compute(MF);
   return Info;
 }
+
+PreservedAnalyses
+OffloadBlockUniformityPrinterPass::run(MachineFunction &MF,
+                                       MachineFunctionAnalysisManager &MFAM) {
+  auto &Info = MFAM.getResult<OffloadBlockUniformityAnalysis>(MF);
+  Info.print(OS, MF);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
index 60a1864020251..0014c65062ef0 100644
--- a/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
+++ b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
@@ -1,7 +1,8 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=greedy < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -stop-after=finalize-isel -o - %s | \
+; RUN:   llc -mtriple=amdgcn-amd-amdhsa -passes='print<offload-block-uniformity>' -x mir -filetype=null 2>&1 | FileCheck %s
 
 ; Test that OffloadBlockUniformityAnalysis correctly reads offload-block-uniformity
-; metadata from IR basic blocks and propagates uniformity information to codegen.
+; metadata from IR basic blocks and classifies machine blocks.
 ;
 ; This metadata is attached during PGO-use phase to indicate whether a basic block
 ; was executed uniformly (all lanes together) or divergently (partial wave).
@@ -9,85 +10,65 @@
 ; The analysis is consumed by SpillPlacement to flatten block frequencies for
 ; divergent blocks, preventing PGO from causing regressions on divergent code paths.
 
-; CHECK-LABEL: name: uniform_blocks
-; CHECK: body:
-define amdgpu_kernel void @uniform_blocks(ptr addrspace(1) %out, i32 %cond) #0 {
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @uniform_blocks
+; CHECK-NEXT: HasUniformity: true
+; CHECK: BLOCK bb.{{[0-9]+}} (%entry): uniform
+define amdgpu_kernel void @uniform_blocks(ptr addrspace(1) %out) #0 {
 entry:
-  %cmp = icmp sgt i32 %cond, 0
-  ; Entry block is uniform - all threads enter together
-  br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
-
-if.then:
-  ; This block is uniform - all threads that enter, enter together
   store i32 1, ptr addrspace(1) %out, align 4
-  br label %if.end, !offload-block-uniformity !0
-
-if.end:
-  ; Exit block is uniform
   ret void, !offload-block-uniformity !0
 }
 
-; CHECK-LABEL: name: divergent_blocks
-; CHECK: body:
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @divergent_blocks
+; CHECK-NEXT: HasUniformity: true
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.then): divergent
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.else): uniform
 define amdgpu_kernel void @divergent_blocks(ptr addrspace(1) %out, i32 %tid) #0 {
 entry:
   %cmp = icmp eq i32 %tid, 0
-  ; Entry is uniform, but branch target is divergent
-  br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
+  br i1 %cmp, label %if.then, label %if.else
 
 if.then:
-  ; This block is divergent - only some threads enter
   store i32 1, ptr addrspace(1) %out, align 4
-  br label %if.end, !offload-block-uniformity !1
+  ret void, !offload-block-uniformity !1
 
-if.end:
-  ; Exit block is uniform (reconverged)
+if.else:
+  store i32 2, ptr addrspace(1) %out, align 4
   ret void, !offload-block-uniformity !0
 }
 
-; CHECK-LABEL: name: mixed_uniformity
-; CHECK: body:
-define amdgpu_kernel void @mixed_uniformity(ptr addrspace(1) %out, i32 %cond, i32 %tid) #0 {
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @missing_metadata
+; CHECK-NEXT: HasUniformity: true
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.then): no-metadata (treated divergent)
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.else): uniform
+define amdgpu_kernel void @missing_metadata(ptr addrspace(1) %out, i32 %cond) #0 {
 entry:
-  %cmp1 = icmp sgt i32 %cond, 0
-  br i1 %cmp1, label %outer.then, label %exit, !offload-block-uniformity !0
-
-outer.then:
-  ; Uniform outer branch
-  %cmp2 = icmp eq i32 %tid, 0
-  br i1 %cmp2, label %inner.then, label %inner.else, !offload-block-uniformity !0
+  %cmp = icmp sgt i32 %cond, 0
+  br i1 %cmp, label %if.then, label %if.else
 
-inner.then:
-  ; Divergent inner branch - only lane 0
+if.then:
   store i32 1, ptr addrspace(1) %out, align 4
-  br label %merge, !offload-block-uniformity !1
+  ret void
 
-inner.else:
-  ; Divergent - other lanes
+if.else:
   store i32 2, ptr addrspace(1) %out, align 4
-  br label %merge, !offload-block-uniformity !1
-
-merge:
-  ; Reconverged - uniform again
-  br label %exit, !offload-block-uniformity !0
-
-exit:
   ret void, !offload-block-uniformity !0
 }
 
-; CHECK-LABEL: name: no_uniformity_metadata
-; CHECK: body:
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @no_uniformity_metadata
+; CHECK-NEXT: HasUniformity: false
 define amdgpu_kernel void @no_uniformity_metadata(ptr addrspace(1) %out, i32 %cond) #0 {
 entry:
   ; No uniformity metadata - analysis should report hasUniformity() = false
   %cmp = icmp sgt i32 %cond, 0
-  br i1 %cmp, label %if.then, label %if.end
+  br i1 %cmp, label %if.then, label %if.else
 
 if.then:
   store i32 1, ptr addrspace(1) %out, align 4
-  br label %if.end
+  ret void
 
-if.end:
+if.else:
+  store i32 2, ptr addrspace(1) %out, align 4
   ret void
 }
 



More information about the llvm-commits mailing list