[llvm] [AMDGPU] Automatic conversion from wave32 to wave64 (PR #137376)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 25 12:14:21 PDT 2025
================
@@ -0,0 +1,321 @@
+//===- SIConvertWaveSize.cpp - Automatically convert wave32 to wave64 ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+// Small, short-lived kernels may become wave-slot limited.
+// To work around the problem an optimization is proposed to convert such
+// kernels from wave32 to wave64 automatically. These kernels shall conform
+// to a strict set of limitations and satisfy profitability conditions.
+//
+// 1. A kernel shall have no function calls as we cannot analyze call stack
+// requirements (nor would it fall into the category of short-lived kernels
+// anyway).
+// 2. A kernel itself shall not be called from a device enqueue call.
+// 3. A kernel shall not attempt to access EXEC or VCC in any user visible
+// way.
+// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
+// operations in general.
+// 5. A kernel shall not read wavefront size or use ballot through
+// intrinsics (a use of the pre-defined frontend wave size macro was deemed
+// permissible for now; see the example after this list).
+// 6. There shall be no atomic operations of any sort as these may be used
+// for cross-thread communication.
+// 7. There shall be no LDS access as the allocation is usually tied to the
+// workgroup size and we generally cannot extend it. Extending it would also
+// change occupancy, which is tied to the wave size.
+// 8. There shall be no inline asm calls.
+// 9. There shall be no dynamic VGPRs.
+// 10. Starting from GFX11 some instructions (such as WMMA on GFX11+ and
+// transpose loads on GFX12+) work differently (have different operands) in
+// wave32 and wave64. The kernel shall not have intrinsics to invoke such
+// instructions.
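+//
+// As an example of rule 5, a kernel that only uses the frontend macro
+// __AMDGCN_WAVEFRONT_SIZE__ may still qualify, while one that calls
+// __builtin_amdgcn_ballot_w32() is rejected.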
+
+#include "SIConvertWaveSize.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-convert-wave-size"
+
+namespace {
+class SIConvertWaveSize {
+ const TargetMachine *TM;
+ const LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+
+ // Accumulated latency estimate for the whole kernel.
+ InstructionCost TotalCost = 0;
+
+ // Latency budget for the profitability check: kernels estimated above
+ // this are not considered short-lived and are left as wave32.
+ static const unsigned MaxLatency = 2000;
+
+ // Intrinsic callees collected while scanning the kernel; candidates for
+ // the wave size attribute change.
+ SmallVector<Function *> Callees;
+
+public:
+ SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI,
+ ScalarEvolution *SE, TargetTransformInfo *TTI)
+ : TM(TM), LI(LI), SE(SE), TTI(TTI) {}
+
+ bool run(Function &F);
+
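+ // Rewrite the wave size target feature of \p F
+ // (wavefrontsize32 -> wavefrontsize64).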
+ bool changeWaveSizeAttr(Function *F);
+};
+
+class SIConvertWaveSizeLegacy : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID;
+ SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+ bool runOnFunction(Function &F) override {
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+ return Impl.run(F);
+ }
+ StringRef getPassName() const override { return "SI convert wave size"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesAll();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+static void printFunctionAttributes(const Function &F) {
+ LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+ for (const auto &Attr : F.getAttributes())
+ LLVM_DEBUG(dbgs() << " Attribute: " << Attr.getAsString() << "\n");
+}
+
+bool SIConvertWaveSize::run(Function &F) {
+ LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n");
+ LLVM_DEBUG(printFunctionAttributes(F));
+
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
+ return false;
+
+ // Check if the function is a kernel.
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ return false;
+
+ // Check if the kernel is wave32.
+ if (F.hasFnAttribute("target-features")) {
+ if (!F.getFnAttribute("target-features")
+ .getValueAsString().contains("wavefrontsize32")) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n");
+ return false;
+ }
+ }
+
+ // Check if the kernel can be called from a device enqueue call.
+ if (F.hasFnAttribute("amdgpu-device-enqueue")) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n");
+ return false;
+ }
+
+ // Check that the trip count is a compile-time constant for all loops in
+ // the kernel, including nested ones.
+ for (Loop *L : LI->getLoopsInPreorder()) {
+ const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L);
+ if (!isa<SCEVConstant>(TripCountSCEV)) {
+ LLVM_DEBUG(
+ dbgs() << "SIConvertWaveSize: Trip count is not a compile time "
+ "constant.\n");
+ return false;
+ }
+ }
+
+ for (const auto &BB : F) {
+ InstructionCost BlockCost = 0;
+ for (const auto &I : BB) {
+ // Atomic operations are not allowed (they may be used for cross-thread
+ // communication). Note this must be checked on the instruction, not on
+ // a call site: loads, stores, fences and RMW operations are not calls.
+ if (I.isAtomic()) {
+ LLVM_DEBUG(dbgs()
+ << "SIConvertWaveSize: Atomic operation detected.\n");
+ return false;
+ }
+ if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+ // FIXME: No calls are allowed. Only non-convergent intrinsic calls and
+ // amdgcn_s_barrier are exempt. Inline asm and atomics are checked
+ // separately for debug purposes. This will be changed in the final
+ // version.
+ if (CB->isInlineAsm()) {
+ // Inline assembly is not allowed.
+ LLVM_DEBUG(dbgs()
+ << "SIConvertWaveSize: Inline assembly detected.\n");
+ return false;
+ }
+ if (Function *Callee = CB->getCalledFunction()) {
+ // Assuming readlane/readfirstlane and any cross-lane/DPP operations
+ // have "let isConvergent = 1" in IntrinsicsAMDGPU.td.
+ if (Callee->isIntrinsic()) {
+ if (Callee->hasFnAttribute(Attribute::Convergent)) {
+ if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) {
+ // TODO: what else should go in a "white list" ?
+ // Intrinsic::amdgcn_s_barrier_wavefront ?
+ // Intrinsic::amdgcn_s_barrier_signal ?
+ LLVM_DEBUG(dbgs()
+ << "SIConvertWaveSize: Convergent intrinsic "
+ << Callee->getName() << " detected.\n");
+ return false;
+ }
+ }
+
+ if (Callee->getIntrinsicID() == Intrinsic::read_register) {
+ if (const auto *MDVal =
+ dyn_cast<MetadataAsValue>(CB->getArgOperand(0))) {
+ Metadata *MD = MDVal->getMetadata();
+ if (auto *MDNodeVal = dyn_cast<MDNode>(MD)) {
+ if (MDNodeVal->getNumOperands() >= 1) {
+ if (auto *MDStr =
+ dyn_cast<MDString>(MDNodeVal->getOperand(0))) {
+ if (MDStr->getString().starts_with("exec") ||
+ MDStr->getString().starts_with("vcc")) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register("
+ << MDStr->getString()
+ << ") intrinsic detected.\n");
+ return false;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Save callee as a candidate for attribute change
+ Callees.push_back(Callee);
+ }
+ } else {
+ // General calls are not allowed.
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n");
+ return false;
+ }
+ }
+ // No LDS access is allowed
+ if (auto *LD = dyn_cast<LoadInst>(&I)) {
----------------
rampitec wrote:
Load or store to a local address is not a sufficient check: LDS may also be addressed via flat. The kernel shall have no references to LDS objects at all.
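
A minimal sketch of such a check, assuming the pass walks the module's globals and conservatively looks through constant users (e.g. an addrspacecast to the flat address space); the helper name `kernelReferencesLDS` and its placement are hypothetical, not part of this PR:

```cpp
// Reject the kernel if it references any LDS (addrspace(3)) global,
// no matter how the memory is ultimately addressed.
static bool kernelReferencesLDS(const Function &F) {
  for (const GlobalVariable &GV : F.getParent()->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;
    // Walk the users, looking through constant expressions (such as an
    // addrspacecast to flat) until an instruction inside F is found.
    SmallVector<const User *, 16> Worklist(GV.user_begin(), GV.user_end());
    SmallPtrSet<const User *, 16> Visited;
    while (!Worklist.empty()) {
      const User *U = Worklist.pop_back_val();
      if (!Visited.insert(U).second)
        continue;
      if (const auto *I = dyn_cast<Instruction>(U)) {
        if (I->getFunction() == &F)
          return true;
      } else if (const auto *C = dyn_cast<Constant>(U)) {
        Worklist.append(C->user_begin(), C->user_end());
      }
    }
  }
  return false;
}
```

With a check like this up front in run(), the per-instruction load/store address-space tests here could be dropped entirely.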
https://github.com/llvm/llvm-project/pull/137376