[llvm] [AMDGPU] Eliminate likely-spurious execz checks (PR #117567)
Fabian Ritter via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 28 01:12:17 PST 2024
================
@@ -0,0 +1,201 @@
+//===- AMDGPUAnnotateVaryingBranchWeights.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Estimate if conditional branches for which SIAnnotateControlFlow introduced
+// amdgcn_if or amdgcn_else intrinsics are likely to have different outcomes for
+// the threads of each wavefront. If that is the case, BranchWeight metadata is
+// added to signal that "then" and "else" blocks are both likely to be executed.
+// This may introduce branch weights that would be self-contradictory in a
+// non-SIMT setting.
+//
+// A consequence of this is that SIPreEmitPeephole is more likely to eliminate
+// s_cbranch_execz instructions that were introduced to skip these blocks when
+// no thread in the wavefront is active for them.
+//
+// Should only run after SIAnnotateControlFlow.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-annotate-varying-branch-weights"
+
+namespace {
+
+class AMDGPUAnnotateVaryingBranchWeightsImpl {
+public:
+ AMDGPUAnnotateVaryingBranchWeightsImpl() = delete;
+ AMDGPUAnnotateVaryingBranchWeightsImpl(const GCNSubtarget &ST,
+ const TargetTransformInfo &TTI)
+ : ST(ST), TTI(TTI) {
+ // Determine weights that signal that a branch is very likely to be
+ // predicted correctly, i.e., whose ratio exceeds
+ // TTI.getPredictableBranchThreshold().
+ auto BranchProbThreshold = TTI.getPredictableBranchThreshold();
+ LikelyWeight = BranchProbThreshold.getNumerator();
+ UnlikelyWeight = BranchProbThreshold.getDenominator() - LikelyWeight;
+ if (UnlikelyWeight > 0)
+ --UnlikelyWeight;
+ }
+
+ bool run(Function &F);
+
+private:
+ const GCNSubtarget &ST;
+ const TargetTransformInfo &TTI;
+ uint32_t LikelyWeight;
+ uint32_t UnlikelyWeight;
+ ValueMap<const Value *, bool> LikelyVaryingCache;
+
+ /// Heuristically check if it is likely that a wavefront has dynamically
+ /// varying values for V.
+ bool isLikelyVarying(const Value *V);
+
+ /// Set branch weights that signal that the "true" successor of Term is the
+ /// likely destination, if no prior weights are present.
+ /// Return true if weights were set.
+ bool setTrueSuccessorLikely(BranchInst *Term);
+};
+
+class AMDGPUAnnotateVaryingBranchWeightsLegacy : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUAnnotateVaryingBranchWeightsLegacy() : FunctionPass(ID) {
+ initializeAMDGPUAnnotateVaryingBranchWeightsLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "AMDGPU Annotate Varying Branch Weights";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+
+ bool runOnFunction(Function &F) override {
+ TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
+ return AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+ }
+};
+
+} // end anonymous namespace
+
+char AMDGPUAnnotateVaryingBranchWeightsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
+ "Annotate Varying Branch Weights", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateVaryingBranchWeightsLegacy, DEBUG_TYPE,
+ "Annotate Varying Branch Weights", false, false)
+
+FunctionPass *llvm::createAMDGPUAnnotateVaryingBranchWeightsLegacyPass() {
+ return new AMDGPUAnnotateVaryingBranchWeightsLegacy();
+}
+
+PreservedAnalyses
+AMDGPUAnnotateVaryingBranchWeightsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
+ bool Changed = AMDGPUAnnotateVaryingBranchWeightsImpl(ST, TTI).run(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::isLikelyVarying(const Value *V) {
+ // Check if V is a source of divergence or if it transitively uses one.
+ if (TTI.isSourceOfDivergence(V))
+ return true;
+
+ auto *U = dyn_cast<User>(V);
+ if (!U)
+ return false;
+
+ // Have we already checked V?
+ auto CacheEntry = LikelyVaryingCache.find(V);
+ if (CacheEntry != LikelyVaryingCache.end())
+ return CacheEntry->second;
+
+ // Does it use a likely varying Value?
+ bool Result = false;
+ for (const auto &Use : U->operands()) {
+ Result |= isLikelyVarying(Use);
+ if (Result)
+ break;
+ }
+
+ LikelyVaryingCache.insert({V, Result});
+ return Result;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::setTrueSuccessorLikely(
+ BranchInst *Term) {
+ assert(Term->isConditional());
+
+ // Don't overwrite existing branch weights.
+ if (hasProfMD(*Term))
+ return false;
+
+ llvm::setBranchWeights(*Term, {LikelyWeight, UnlikelyWeight}, false);
+ LLVM_DEBUG(dbgs() << "Added branch weights: " << *Term << '\n');
+ return true;
+}
+
+bool AMDGPUAnnotateVaryingBranchWeightsImpl::run(Function &F) {
+ // If the workgroup has only a single thread, the condition cannot vary.
+ const auto WGSizes = ST.getFlatWorkGroupSizes(F);
+ if (WGSizes.first <= 1)
----------------
ritter-x2a wrote:
> Can you use `ST.isSingleLaneExecution` instead?
Returning early if `ST.isSingleLaneExecution(F)` is `false` would mean annotating more cases: The intention of the current version was to annotate if it's known that single lane execution cannot happen for `F`. The changed variant would additionally annotate cases where we don't know if single lane execution could happen.
I'd be open to that change; it seems likely that kernels without explicit work group bounds and for which performance matters use more than a single lane.
https://github.com/llvm/llvm-project/pull/117567
More information about the llvm-commits
mailing list