[llvm] [CodeGen] Add OffloadBlockUniformityAnalysis for offload PGO (PR #178417)
Yaxun Liu via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 28 07:03:42 PST 2026
https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/178417
>From ffe44a5be52641dda5ede8a25ba5f5060435ddf6 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Tue, 27 Jan 2026 16:42:05 -0500
Subject: [PATCH 1/3] [CodeGen] Add OffloadBlockUniformityAnalysis for offload
PGO
Add a target-agnostic MachineFunction analysis that propagates per-block
uniformity information from IR metadata to codegen passes.
The analysis reads "offload-block-uniformity" metadata attached to IR
BasicBlock terminators during PGO-use. This metadata is produced by the
offload PGO infrastructure when profile data includes uniformity bits.
SpillPlacement consumes this analysis to flatten block frequencies for
divergent blocks: each divergent block is assigned the entry-block
frequency instead of its profiled frequency. This prevents PGO-guided
spill placement from causing performance regressions on SIMT
architectures, where "cold" divergent paths still execute with partial
wave occupancy.
Key components:
- OffloadBlockUniformityInfo: Stores per-MBB divergence classification
- OffloadBlockUniformityAnalysis: MachineFunctionAnalysis wrapper
- SpillPlacement integration: Queries analysis for divergent blocks
This is independent of the core offload PGO infrastructure - if no
metadata exists, the analysis reports hasUniformity()=false and
SpillPlacement behaves normally.
---
.../llvm/CodeGen/OffloadBlockUniformity.h | 65 ++++++++++++++
llvm/include/llvm/CodeGen/SpillPlacement.h | 4 +-
llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 +
.../llvm/Passes/MachinePassRegistry.def | 2 +
llvm/lib/CodeGen/CMakeLists.txt | 1 +
llvm/lib/CodeGen/OffloadBlockUniformity.cpp | 84 +++++++++++++++++++
llvm/lib/CodeGen/SpillPlacement.cpp | 24 ++++--
llvm/lib/Passes/PassBuilder.cpp | 1 +
8 files changed, 176 insertions(+), 6 deletions(-)
create mode 100644 llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
create mode 100644 llvm/lib/CodeGen/OffloadBlockUniformity.cpp
diff --git a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
new file mode 100644
index 0000000000000..31e57758a23a0
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
@@ -0,0 +1,65 @@
+//===- OffloadBlockUniformity.h - Offload block uniformity info -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provide per-(Machine)basic-block uniformity information for offload profiles.
+//
+// The source of truth is IR metadata attached during PGO use:
+// - Metadata name: "offload-block-uniformity"
+// - Payload: i1 (true = uniform, false = divergent)
+//
+// This is intentionally target-agnostic: any offload backend that produces
+// uniformity bits in the profile can attach the same metadata and reuse this
+// analysis in codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
+#define LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineFunction;
+
+// Per-function record of which machine basic blocks are classified as
+// divergent, derived from "offload-block-uniformity" IR metadata.
+class OffloadBlockUniformityInfo {
+public:
+ // Name of the metadata kind looked up on IR block terminators.
+ static constexpr StringLiteral MetadataName = "offload-block-uniformity";
+
+ // Discards any previous state and recomputes the classification for MF.
+ LLVM_ABI void compute(const MachineFunction &MF);
+
+ // True if at least one block of the last computed function carried the
+ // annotation; when false, isDivergent() returns false for every block.
+ bool hasUniformity() const { return HasAnyUniformity; }
+
+ // Returns true if the block is considered divergent. If uniformity exists for
+ // the function but a block has no explicit annotation, it is treated as
+ // divergent (conservative).
+ LLVM_ABI bool isDivergent(const MachineBasicBlock &MBB) const;
+
+private:
+ // Set by compute() when any annotation is found in the function.
+ bool HasAnyUniformity = false;
+ // Indexed by machine basic block number; a set bit means "divergent".
+ BitVector DivergentBlocks;
+};
+
+// Machine-function analysis whose result is the OffloadBlockUniformityInfo
+// for the function it is run on.
+class OffloadBlockUniformityAnalysis
+ : public AnalysisInfoMixin<OffloadBlockUniformityAnalysis> {
+ friend AnalysisInfoMixin<OffloadBlockUniformityAnalysis>;
+ static AnalysisKey Key;
+
+public:
+ using Result = OffloadBlockUniformityInfo;
+ // Builds a fresh Result and fills it via OffloadBlockUniformityInfo::compute.
+ LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
diff --git a/llvm/include/llvm/CodeGen/SpillPlacement.h b/llvm/include/llvm/CodeGen/SpillPlacement.h
index 1ef37f2718a65..490ebbb236efc 100644
--- a/llvm/include/llvm/CodeGen/SpillPlacement.h
+++ b/llvm/include/llvm/CodeGen/SpillPlacement.h
@@ -39,6 +39,7 @@ class BitVector;
class EdgeBundles;
class MachineBlockFrequencyInfo;
class MachineFunction;
+class OffloadBlockUniformityInfo;
class SpillPlacementWrapperLegacy;
class SpillPlacementAnalysis;
@@ -169,7 +170,8 @@ class SpillPlacement {
void releaseMemory();
void run(MachineFunction &MF, EdgeBundles *Bundles,
- MachineBlockFrequencyInfo *MBFI);
+ MachineBlockFrequencyInfo *MBFI,
+ const OffloadBlockUniformityInfo *Uniformity = nullptr);
void activate(unsigned n);
void setThreshold(BlockFrequency Entry);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 6942fc42ca721..7930a58fcc290 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -60,6 +60,7 @@
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/MachineSink.h"
#include "llvm/CodeGen/MachineVerifier.h"
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
#include "llvm/CodeGen/OptimizePHIs.h"
#include "llvm/CodeGen/PEI.h"
#include "llvm/CodeGen/PHIElimination.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 20b066a2ead6d..ed56bd6acd3fc 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -81,6 +81,8 @@ MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree",
MachinePostDominatorTreeAnalysis())
MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis())
MACHINE_FUNCTION_ANALYSIS("machine-uniformity", MachineUniformityAnalysis())
+MACHINE_FUNCTION_ANALYSIS("offload-block-uniformity",
+ OffloadBlockUniformityAnalysis())
MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
MACHINE_FUNCTION_ANALYSIS("reaching-def", ReachingDefAnalysis())
MACHINE_FUNCTION_ANALYSIS("regalloc-evict", RegAllocEvictionAdvisorAnalysis())
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index f26b2cb6fddf5..b358150569cb3 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -68,6 +68,7 @@ add_llvm_component_library(LLVMCodeGen
FixupStatepointCallerSaved.cpp
FuncletLayout.cpp
MachineFunctionAnalysis.cpp
+ OffloadBlockUniformity.cpp
GCMetadata.cpp
GCMetadataPrinter.cpp
GCRootLowering.cpp
diff --git a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
new file mode 100644
index 0000000000000..13d772c08d92c
--- /dev/null
+++ b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
@@ -0,0 +1,84 @@
+//===- OffloadBlockUniformity.cpp - Offload block uniformity info --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Metadata.h"
+#include <optional>
+
+using namespace llvm;
+
+/// Read the "offload-block-uniformity" annotation from the terminator of
+/// \p BB.
+///
+/// \returns true if the block is annotated uniform, false if it is annotated
+/// divergent, and std::nullopt if the block has no terminator, carries no
+/// annotation, or the annotation is not of the form !{i1 IsUniform}.
+static std::optional<bool> getIRBlockUniformity(const BasicBlock &BB) {
+  const Instruction *TI = BB.getTerminator();
+  if (!TI)
+    return std::nullopt;
+
+  const MDNode *MD = TI->getMetadata(OffloadBlockUniformityInfo::MetadataName);
+  if (!MD || MD->getNumOperands() < 1)
+    return std::nullopt;
+
+  // Expected format: !{i1 IsUniform}. Nothing in this series verifies the
+  // payload, so do not assume its shape: a null or non-ConstantInt operand is
+  // handled like a missing annotation instead of asserting in
+  // mdconst::extract.
+  const auto *IsUniform =
+      mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(0));
+  if (!IsUniform)
+    return std::nullopt;
+  return IsUniform->isOne();
+}
+
+// Recompute the per-block classification for MF from the
+// "offload-block-uniformity" metadata on the terminators of the IR blocks the
+// machine blocks refer to. Any previously computed state is discarded.
+//
+// NOTE(review): DivergentBlocks.set(Num, IsDivergent) at the end of the second
+// loop passes a bool as the second argument of BitVector::set(I, E), which
+// sets a range of bits rather than assigning a single bit. PATCH 2/3 of this
+// series replaces that call.
+void OffloadBlockUniformityInfo::compute(const MachineFunction &MF) {
+ HasAnyUniformity = false;
+ DivergentBlocks.clear();
+ // One bit per machine basic block number.
+ DivergentBlocks.resize(MF.getNumBlockIDs());
+
+ // First determine whether any uniformity annotation exists for this function.
+ for (const MachineBasicBlock &MBB : MF) {
+ const BasicBlock *BB = MBB.getBasicBlock();
+ if (!BB)
+ continue;
+ if (getIRBlockUniformity(*BB).has_value()) {
+ HasAnyUniformity = true;
+ break;
+ }
+ }
+
+ // No annotation anywhere: leave every bit clear; isDivergent() will report
+ // false for all blocks.
+ if (!HasAnyUniformity)
+ return;
+
+ // Conservative behavior: if uniformity exists for the function but we cannot
+ // classify a particular (Machine)basic block, treat it as divergent.
+ for (const MachineBasicBlock &MBB : MF) {
+ const unsigned Num = MBB.getNumber();
+ // Divergent unless the IR block carries an explicit annotation saying
+ // otherwise.
+ bool IsDivergent = true;
+ if (const BasicBlock *BB = MBB.getBasicBlock()) {
+ if (auto U = getIRBlockUniformity(*BB))
+ IsDivergent = !*U;
+ }
+ if (Num < DivergentBlocks.size())
+ DivergentBlocks.set(Num, IsDivergent);
+ }
+}
+
+// Query the classification recorded by compute(). When the function has no
+// uniformity data nothing is reported divergent; otherwise a block is
+// divergent if its bit is set or if its number has no slot in the bit vector.
+bool OffloadBlockUniformityInfo::isDivergent(
+    const MachineBasicBlock &MBB) const {
+  if (!HasAnyUniformity)
+    return false;
+
+  const unsigned BlockNum = MBB.getNumber();
+  const bool HasSlot = BlockNum < DivergentBlocks.size();
+  // A block without a slot was never classified; stay conservative.
+  return !HasSlot || DivergentBlocks.test(BlockNum);
+}
+
+AnalysisKey OffloadBlockUniformityAnalysis::Key;
+
+// Analysis entry point: builds a fresh OffloadBlockUniformityInfo for MF. The
+// analysis manager argument is unused because compute() reads only MF.
+OffloadBlockUniformityAnalysis::Result
+OffloadBlockUniformityAnalysis::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &) {
+ OffloadBlockUniformityInfo Info;
+ Info.compute(MF);
+ return Info;
+}
diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp b/llvm/lib/CodeGen/SpillPlacement.cpp
index 55a96a22a00ec..fd7e9a1d9919c 100644
--- a/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -32,7 +32,10 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include <algorithm>
@@ -193,7 +196,9 @@ bool SpillPlacementWrapperLegacy::runOnMachineFunction(MachineFunction &MF) {
auto *Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
- Impl.run(MF, Bundles, MBFI);
+ OffloadBlockUniformityInfo Uniformity;
+ Uniformity.compute(MF);
+ Impl.run(MF, Bundles, MBFI, &Uniformity);
return false;
}
@@ -204,8 +209,9 @@ SpillPlacementAnalysis::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
auto *Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF);
auto *MBFI = &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+ auto &Uniformity = MFAM.getResult<OffloadBlockUniformityAnalysis>(MF);
SpillPlacement Impl;
- Impl.run(MF, Bundles, MBFI);
+ Impl.run(MF, Bundles, MBFI, &Uniformity);
return Impl;
}
@@ -217,7 +223,8 @@ bool SpillPlacementAnalysis::Result::invalidate(
return true;
// Check dependencies.
return Inv.invalidate<EdgeBundlesAnalysis>(MF, PA) ||
- Inv.invalidate<MachineBlockFrequencyAnalysis>(MF, PA);
+ Inv.invalidate<MachineBlockFrequencyAnalysis>(MF, PA) ||
+ Inv.invalidate<OffloadBlockUniformityAnalysis>(MF, PA);
}
SpillPlacement::SpillPlacement() = default;
@@ -230,7 +237,8 @@ void SpillPlacement::releaseMemory() {
}
void SpillPlacement::run(MachineFunction &mf, EdgeBundles *Bundles,
- MachineBlockFrequencyInfo *MBFI) {
+ MachineBlockFrequencyInfo *MBFI,
+ const OffloadBlockUniformityInfo *Uniformity) {
MF = &mf;
this->bundles = Bundles;
this->MBFI = MBFI;
@@ -240,12 +248,18 @@ void SpillPlacement::run(MachineFunction &mf, EdgeBundles *Bundles,
TodoList.clear();
TodoList.setUniverse(bundles->getNumBundles());
+ const bool HasUniformity = Uniformity && Uniformity->hasUniformity();
+
// Compute total ingoing and outgoing block frequencies for all bundles.
BlockFrequencies.resize(mf.getNumBlockIDs());
setThreshold(MBFI->getEntryFreq());
for (auto &I : mf) {
unsigned Num = I.getNumber();
- BlockFrequencies[Num] = MBFI->getBlockFreq(&I);
+ if (HasUniformity && Uniformity->isDivergent(I)) {
+ BlockFrequencies[Num] = MBFI->getEntryFreq();
+ } else {
+ BlockFrequencies[Num] = MBFI->getBlockFreq(&I);
+ }
}
}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8bb78c8c7df63..d541062b743d2 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -135,6 +135,7 @@
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/MachineVerifier.h"
+#include "llvm/CodeGen/OffloadBlockUniformity.h"
#include "llvm/CodeGen/OptimizePHIs.h"
#include "llvm/CodeGen/PEI.h"
#include "llvm/CodeGen/PHIElimination.h"
>From 05e1805c3ce635148a3b7fe94d71044fc3cf4960 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 28 Jan 2026 08:52:48 -0500
Subject: [PATCH 2/3] [CodeGen] Fix BitVector::set bug and add test for
OffloadBlockUniformity
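Fix incorrect BitVector::set() usage: set(I, E) sets the range of bits
[I, E); it does not assign a value to a single bit. Passing the bool
IsDivergent as the second argument turned it into a range end of 0 or 1,
so any block whose number was greater than that end (for example, every
block after the first when IsDivergent=false) hit the
"Attempted to set backwards range!" assertion failure.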
Also add codegen test for the OffloadBlockUniformityAnalysis that
verifies the analysis correctly reads offload-block-uniformity metadata.
---
.../llvm/CodeGen/OffloadBlockUniformity.h | 3 +-
llvm/lib/CodeGen/OffloadBlockUniformity.cpp | 4 +-
.../AMDGPU/offload-block-uniformity.ll | 98 +++++++++++++++++++
3 files changed, 102 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
diff --git a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
index 31e57758a23a0..66821c1f61167 100644
--- a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
+++ b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
@@ -57,7 +57,8 @@ class OffloadBlockUniformityAnalysis
public:
using Result = OffloadBlockUniformityInfo;
- LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+ LLVM_ABI Result run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
};
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
index 13d772c08d92c..235e40a6e3982 100644
--- a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
+++ b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
@@ -58,8 +58,8 @@ void OffloadBlockUniformityInfo::compute(const MachineFunction &MF) {
if (auto U = getIRBlockUniformity(*BB))
IsDivergent = !*U;
}
- if (Num < DivergentBlocks.size())
- DivergentBlocks.set(Num, IsDivergent);
+ if (Num < DivergentBlocks.size() && IsDivergent)
+ DivergentBlocks.set(Num);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
new file mode 100644
index 0000000000000..60a1864020251
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=greedy < %s | FileCheck %s
+
+; Test that OffloadBlockUniformityAnalysis correctly reads offload-block-uniformity
+; metadata from IR basic blocks and propagates uniformity information to codegen.
+;
+; This metadata is attached during PGO-use phase to indicate whether a basic block
+; was executed uniformly (all lanes together) or divergently (partial wave).
+;
+; The analysis is consumed by SpillPlacement to flatten block frequencies for
+; divergent blocks, preventing PGO from causing regressions on divergent code paths.
+
+; CHECK-LABEL: name: uniform_blocks
+; CHECK: body:
+define amdgpu_kernel void @uniform_blocks(ptr addrspace(1) %out, i32 %cond) #0 {
+entry:
+ %cmp = icmp sgt i32 %cond, 0
+ ; Entry block is uniform - all threads enter together
+ br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
+
+if.then:
+ ; This block is uniform - all threads that enter, enter together
+ store i32 1, ptr addrspace(1) %out, align 4
+ br label %if.end, !offload-block-uniformity !0
+
+if.end:
+ ; Exit block is uniform
+ ret void, !offload-block-uniformity !0
+}
+
+; CHECK-LABEL: name: divergent_blocks
+; CHECK: body:
+define amdgpu_kernel void @divergent_blocks(ptr addrspace(1) %out, i32 %tid) #0 {
+entry:
+ %cmp = icmp eq i32 %tid, 0
+ ; Entry is uniform, but branch target is divergent
+ br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
+
+if.then:
+ ; This block is divergent - only some threads enter
+ store i32 1, ptr addrspace(1) %out, align 4
+ br label %if.end, !offload-block-uniformity !1
+
+if.end:
+ ; Exit block is uniform (reconverged)
+ ret void, !offload-block-uniformity !0
+}
+
+; CHECK-LABEL: name: mixed_uniformity
+; CHECK: body:
+define amdgpu_kernel void @mixed_uniformity(ptr addrspace(1) %out, i32 %cond, i32 %tid) #0 {
+entry:
+ %cmp1 = icmp sgt i32 %cond, 0
+ br i1 %cmp1, label %outer.then, label %exit, !offload-block-uniformity !0
+
+outer.then:
+ ; Uniform outer branch
+ %cmp2 = icmp eq i32 %tid, 0
+ br i1 %cmp2, label %inner.then, label %inner.else, !offload-block-uniformity !0
+
+inner.then:
+ ; Divergent inner branch - only lane 0
+ store i32 1, ptr addrspace(1) %out, align 4
+ br label %merge, !offload-block-uniformity !1
+
+inner.else:
+ ; Divergent - other lanes
+ store i32 2, ptr addrspace(1) %out, align 4
+ br label %merge, !offload-block-uniformity !1
+
+merge:
+ ; Reconverged - uniform again
+ br label %exit, !offload-block-uniformity !0
+
+exit:
+ ret void, !offload-block-uniformity !0
+}
+
+; CHECK-LABEL: name: no_uniformity_metadata
+; CHECK: body:
+define amdgpu_kernel void @no_uniformity_metadata(ptr addrspace(1) %out, i32 %cond) #0 {
+entry:
+ ; No uniformity metadata - analysis should report hasUniformity() = false
+ %cmp = icmp sgt i32 %cond, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 1, ptr addrspace(1) %out, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+
+; Metadata: i1 true = uniform, i1 false = divergent
+!0 = !{i1 true} ; uniform
+!1 = !{i1 false} ; divergent
>From 80ce6f6a53b38ee50d7c69df8f119cf3f4e9ac18 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 28 Jan 2026 10:03:22 -0500
Subject: [PATCH 3/3] [CodeGen] Add offload uniformity printer and stronger
test
Expose offload block uniformity classification via a printer pass so
codegen tests can assert uniform, divergent, and missing-metadata cases.
Update the AMDGPU test to validate the printed classifications.
---
.../llvm/CodeGen/OffloadBlockUniformity.h | 15 ++++
.../llvm/Passes/MachinePassRegistry.def | 2 +
llvm/lib/CodeGen/OffloadBlockUniformity.cpp | 33 ++++++++
.../AMDGPU/offload-block-uniformity.ll | 81 +++++++------------
4 files changed, 81 insertions(+), 50 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
index 66821c1f61167..148c38443c35d 100644
--- a/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
+++ b/llvm/include/llvm/CodeGen/OffloadBlockUniformity.h
@@ -25,12 +25,14 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
class MachineBasicBlock;
class MachineFunction;
+class raw_ostream;
class OffloadBlockUniformityInfo {
public:
@@ -45,6 +47,8 @@ class OffloadBlockUniformityInfo {
// divergent (conservative).
LLVM_ABI bool isDivergent(const MachineBasicBlock &MBB) const;
+ LLVM_ABI void print(raw_ostream &OS, const MachineFunction &MF) const;
+
private:
bool HasAnyUniformity = false;
BitVector DivergentBlocks;
@@ -61,6 +65,17 @@ class OffloadBlockUniformityAnalysis
MachineFunctionAnalysisManager &MFAM);
};
+/// Printer pass that writes the OffloadBlockUniformityAnalysis result of each
+/// machine function it runs on to the stream supplied at construction.
+class OffloadBlockUniformityPrinterPass
+    : public PassInfoMixin<OffloadBlockUniformityPrinterPass> {
+public:
+  explicit OffloadBlockUniformityPrinterPass(raw_ostream &OS) : OS(OS) {}
+
+  /// Prints the analysis result for \p MF.
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+
+  /// Marks the pass as required; always returns true.
+  static bool isRequired() { return true; }
+
+private:
+  /// Destination stream for the printed classification.
+  raw_ostream &OS;
+};
+
} // end namespace llvm
#endif // LLVM_CODEGEN_OFFLOADBLOCKUNIFORMITY_H
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index ed56bd6acd3fc..48d859abfa3b2 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -154,6 +154,8 @@ MACHINE_FUNCTION_PASS("print<machine-post-dom-tree>",
MachinePostDominatorTreePrinterPass(errs()))
MACHINE_FUNCTION_PASS("print<machine-uniformity>",
MachineUniformityPrinterPass(errs()))
+MACHINE_FUNCTION_PASS("print<offload-block-uniformity>",
+ OffloadBlockUniformityPrinterPass(errs()))
MACHINE_FUNCTION_PASS("print<reaching-def>", ReachingDefPrinterPass(errs()))
MACHINE_FUNCTION_PASS("print<slot-indexes>", SlotIndexesPrinterPass(errs()))
MACHINE_FUNCTION_PASS("print<virtregmap>", VirtRegMapPrinterPass(errs()))
diff --git a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
index 235e40a6e3982..868ff3d20dfe7 100644
--- a/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
+++ b/llvm/lib/CodeGen/OffloadBlockUniformity.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/Support/raw_ostream.h"
#include <optional>
using namespace llvm;
@@ -63,6 +64,30 @@ void OffloadBlockUniformityInfo::compute(const MachineFunction &MF) {
}
}
+/// Print the classification of the machine blocks of \p MF to \p OS.
+///
+/// Output is a header line naming the function, a "HasUniformity" line and,
+/// only when uniformity data exists, one "BLOCK" line per machine block:
+/// "uniform" or "divergent" for annotated blocks, and a "(treated divergent)"
+/// note for blocks without an annotation or without an IR block.
+void OffloadBlockUniformityInfo::print(raw_ostream &OS,
+                                       const MachineFunction &MF) const {
+  OS << "OffloadBlockUniformityInfo for function: ";
+  MF.getFunction().printAsOperand(OS, /*PrintType=*/false);
+  OS << '\n';
+  OS << "HasUniformity: " << (HasAnyUniformity ? "true" : "false") << '\n';
+  if (!HasAnyUniformity)
+    return;
+
+  for (const MachineBasicBlock &MBB : MF) {
+    OS << " BLOCK bb." << MBB.getNumber();
+    const BasicBlock *BB = MBB.getBasicBlock();
+    if (!BB) {
+      // compute() classifies machine blocks that have no IR block as
+      // divergent, so report them rather than dropping them from the dump.
+      OS << ": no-ir-block (treated divergent)\n";
+      continue;
+    }
+    if (BB->hasName())
+      OS << " (%" << BB->getName() << ")";
+    if (auto U = getIRBlockUniformity(*BB))
+      OS << ": " << (*U ? "uniform" : "divergent") << '\n';
+    else
+      OS << ": no-metadata (treated divergent)\n";
+  }
+}
+
bool OffloadBlockUniformityInfo::isDivergent(
const MachineBasicBlock &MBB) const {
if (!HasAnyUniformity)
@@ -82,3 +107,11 @@ OffloadBlockUniformityAnalysis::run(MachineFunction &MF,
Info.compute(MF);
return Info;
}
+
+/// Fetch the uniformity analysis result for \p MF and print it to the pass's
+/// output stream. Nothing is modified, so every analysis is preserved.
+PreservedAnalyses
+OffloadBlockUniformityPrinterPass::run(MachineFunction &MF,
+                                       MachineFunctionAnalysisManager &MFAM) {
+  MFAM.getResult<OffloadBlockUniformityAnalysis>(MF).print(OS, MF);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
index 60a1864020251..0014c65062ef0 100644
--- a/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
+++ b/llvm/test/CodeGen/AMDGPU/offload-block-uniformity.ll
@@ -1,7 +1,8 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=greedy < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -stop-after=finalize-isel -o - %s | \
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -passes='print<offload-block-uniformity>' -x mir -filetype=null 2>&1 | FileCheck %s
; Test that OffloadBlockUniformityAnalysis correctly reads offload-block-uniformity
-; metadata from IR basic blocks and propagates uniformity information to codegen.
+; metadata from IR basic blocks and classifies machine blocks.
;
; This metadata is attached during PGO-use phase to indicate whether a basic block
; was executed uniformly (all lanes together) or divergently (partial wave).
@@ -9,85 +10,65 @@
; The analysis is consumed by SpillPlacement to flatten block frequencies for
; divergent blocks, preventing PGO from causing regressions on divergent code paths.
-; CHECK-LABEL: name: uniform_blocks
-; CHECK: body:
-define amdgpu_kernel void @uniform_blocks(ptr addrspace(1) %out, i32 %cond) #0 {
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @uniform_blocks
+; CHECK-NEXT: HasUniformity: true
+; CHECK: BLOCK bb.{{[0-9]+}} (%entry): uniform
+define amdgpu_kernel void @uniform_blocks(ptr addrspace(1) %out) #0 {
entry:
- %cmp = icmp sgt i32 %cond, 0
- ; Entry block is uniform - all threads enter together
- br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
-
-if.then:
- ; This block is uniform - all threads that enter, enter together
store i32 1, ptr addrspace(1) %out, align 4
- br label %if.end, !offload-block-uniformity !0
-
-if.end:
- ; Exit block is uniform
ret void, !offload-block-uniformity !0
}
-; CHECK-LABEL: name: divergent_blocks
-; CHECK: body:
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @divergent_blocks
+; CHECK-NEXT: HasUniformity: true
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.then): divergent
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.else): uniform
define amdgpu_kernel void @divergent_blocks(ptr addrspace(1) %out, i32 %tid) #0 {
entry:
%cmp = icmp eq i32 %tid, 0
- ; Entry is uniform, but branch target is divergent
- br i1 %cmp, label %if.then, label %if.end, !offload-block-uniformity !0
+ br i1 %cmp, label %if.then, label %if.else
if.then:
- ; This block is divergent - only some threads enter
store i32 1, ptr addrspace(1) %out, align 4
- br label %if.end, !offload-block-uniformity !1
+ ret void, !offload-block-uniformity !1
-if.end:
- ; Exit block is uniform (reconverged)
+if.else:
+ store i32 2, ptr addrspace(1) %out, align 4
ret void, !offload-block-uniformity !0
}
-; CHECK-LABEL: name: mixed_uniformity
-; CHECK: body:
-define amdgpu_kernel void @mixed_uniformity(ptr addrspace(1) %out, i32 %cond, i32 %tid) #0 {
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @missing_metadata
+; CHECK-NEXT: HasUniformity: true
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.then): no-metadata (treated divergent)
+; CHECK-DAG: BLOCK bb.{{[0-9]+}} (%if.else): uniform
+define amdgpu_kernel void @missing_metadata(ptr addrspace(1) %out, i32 %cond) #0 {
entry:
- %cmp1 = icmp sgt i32 %cond, 0
- br i1 %cmp1, label %outer.then, label %exit, !offload-block-uniformity !0
-
-outer.then:
- ; Uniform outer branch
- %cmp2 = icmp eq i32 %tid, 0
- br i1 %cmp2, label %inner.then, label %inner.else, !offload-block-uniformity !0
+ %cmp = icmp sgt i32 %cond, 0
+ br i1 %cmp, label %if.then, label %if.else
-inner.then:
- ; Divergent inner branch - only lane 0
+if.then:
store i32 1, ptr addrspace(1) %out, align 4
- br label %merge, !offload-block-uniformity !1
+ ret void
-inner.else:
- ; Divergent - other lanes
+if.else:
store i32 2, ptr addrspace(1) %out, align 4
- br label %merge, !offload-block-uniformity !1
-
-merge:
- ; Reconverged - uniform again
- br label %exit, !offload-block-uniformity !0
-
-exit:
ret void, !offload-block-uniformity !0
}
-; CHECK-LABEL: name: no_uniformity_metadata
-; CHECK: body:
+; CHECK-LABEL: OffloadBlockUniformityInfo for function: @no_uniformity_metadata
+; CHECK-NEXT: HasUniformity: false
define amdgpu_kernel void @no_uniformity_metadata(ptr addrspace(1) %out, i32 %cond) #0 {
entry:
; No uniformity metadata - analysis should report hasUniformity() = false
%cmp = icmp sgt i32 %cond, 0
- br i1 %cmp, label %if.then, label %if.end
+ br i1 %cmp, label %if.then, label %if.else
if.then:
store i32 1, ptr addrspace(1) %out, align 4
- br label %if.end
+ ret void
-if.end:
+if.else:
+ store i32 2, ptr addrspace(1) %out, align 4
ret void
}
More information about the llvm-commits
mailing list