[llvm] [AMDGPU] Automatic conversion from wave32 to wave64 (PR #137376)

Mon May 5 10:16:39 PDT 2025

================
@@ -0,0 +1,321 @@
+//===- SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64
+//---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+// Small short living kernels may become waveslot limited.
+// To work around the problem an optimization is proposed to convert such
+// kernels from wave32 to wave64 automatically.These kernels shall conform to a
+// strict set of limitations and satisfy profitability conditions.
+//
+// 1. A kernel shall have no function calls as we cannot analyze call stack
+// requirements (nor will it fall into a category of short living kernels
+// anyway).
+// 2. A kernel itself shall not be called from a device enqueue call.
+// 3. A kernel shall not attempt to access EXEC or VCC in any user visible
+// way.
+// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
+// operations in general.
+// 5. A kernel shall not read wavefront size or use ballot through
+// intrinsics (a use of pre-defined frontend wave size macro was deemed
+// permissible for now).
+// 6. There shall be no atomic operations of any sort as these may be used
+// for cross-thread communication.
+// 7. There shall be no LDS access as the allocation is usually tied to the
+// workgroup size and we generally cannot extend it. It is also changing
+// occupancy which is tied to the wave size.
+// 8. There shall be no inline asm calls.
+// 9 .There shall be no dynamic VGPRs.
+// 10 .Starting from GFX11 some instructions (such as WMMA on GFX11+ and
+// transpose loads on GFX12+) work differently (have different operands) in
+// wave32 and wave64. The kernel shall not have intrinsics to invoke such
+// instructions.
+
+#include "SIConvertWaveSize.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-convert-wave-size"
+
+namespace {
+class SIConvertWaveSize {
+  const TargetMachine *TM;
+  const LoopInfo *LI;
+  ScalarEvolution *SE;
+  TargetTransformInfo *TTI;
+
+  InstructionCost TotalCost = 0;
+
+  static const unsigned MaxLatency = 2000;
+
+  SmallVector<Function *> Callees;
+
+public:
+  SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI,
+                    ScalarEvolution *SE, TargetTransformInfo *TTI)
+      : TM(TM), LI(LI), SE(SE), TTI(TTI) {}
+
+  bool run(Function &F);
+
+  bool changeWaveSizeAttr(Function *F);
+};
+
+class SIConvertWaveSizeLegacy : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID;
+  SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+  bool runOnFunction(Function &F) override {
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+    return Impl.run(F);
+  }
+  StringRef getPassName() const override { return "SI convert wave size"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.setPreservesAll();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // end anonymous namespace
+
+void printFunctionAttributes(const Function &F) {
+  LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+  for (const auto &Attr : F.getAttributes()) {
+    LLVM_DEBUG(dbgs() << "  Attribute: " << Attr.getAsString() << "\n");
+  }
+}
----------------
alex-t wrote:

It was left for the purpose as this is an early draft. I will remove it once the debugging is finished.

https://github.com/llvm/llvm-project/pull/137376