[llvm] [AMDGPU] Add backward compatibility layer for kernarg preloading (PR #119167)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 29 22:21:15 PST 2024
================
@@ -0,0 +1,217 @@
+//===- AMDGPUPreloadKernArgProlog.cpp - Preload KernArg Prolog ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass creates a backward compatibility layer for kernel argument
+/// preloading in situations where code is compiled with kernel argument
+/// preloading enabled but executed on hardware without firmware support for it.
+///
+/// To avoid recompilation, the pass inserts a block at the beginning of the
+/// program that loads the kernel arguments into SGPRs using s_load
+/// instructions. This sets up the registers exactly as they would be on systems
+/// with compatible firmware.
+///
+/// This effectively creates two entry points for the kernel. Firmware that
+/// supports the feature will automatically jump past the first 256 bytes of the
+/// program, skipping the compatibility layer and directly starting execution on
+/// the optimized code path.
+///
+/// This pass should be run as late as possible to prevent optimizations from
+/// assuming that the padding is dead code or that the added prologue is a
+/// true predecessor of the kernel entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUPreloadKernArgProlog.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/TargetParser/TargetParser.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-preload-kern-arg-prolog"
+
+namespace {
+
+// Used to build s_load instructions that map user SGPRs to kernel arguments
+struct LoadConfig {
+ unsigned Size;
+ const TargetRegisterClass *RegClass;
+ unsigned Opcode;
+ Register LoadReg;
+
+ // Constructor for the static config array
+ constexpr LoadConfig(unsigned S, const TargetRegisterClass *RC, unsigned Op)
+ : Size(S), RegClass(RC), Opcode(Op), LoadReg(AMDGPU::NoRegister) {}
+
+ // Constructor for the return value
+ constexpr LoadConfig(unsigned S, const TargetRegisterClass *RC, unsigned Op,
+ Register Reg)
+ : Size(S), RegClass(RC), Opcode(Op), LoadReg(Reg) {}
+};
+
+class AMDGPUPreloadKernArgProlog {
+public:
+ AMDGPUPreloadKernArgProlog(MachineFunction &MF);
+
+ bool run();
+
+private:
+ MachineFunction &MF;
+ const GCNSubtarget &ST;
+ const SIMachineFunctionInfo &MFI;
+ const SIInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+
+ // Create a new block before the entry point to the kernel. Firmware that
+ // supports preloading kernel arguments will automatically jump past this
+ // block to the alternative kernel entry point.
+ void createBackCompatBlock(unsigned NumKernArgPreloadSGPRs);
+
+ // Add instructions to load kernel arguments into SGPRs.
+ void addBackCompatLoads(MachineBasicBlock *BackCompatMBB,
+ Register KernArgSegmentPtr,
+ unsigned NumKernArgPreloadSGPRs);
+};
+
+class AMDGPUPreloadKernArgPrologLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPreloadKernArgPrologLegacy() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "AMDGPU Preload Kernel Arguments Prolog";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUPreloadKernArgPrologLegacy::ID = 0;
+
+INITIALIZE_PASS(AMDGPUPreloadKernArgPrologLegacy, DEBUG_TYPE,
+ "AMDGPU Preload Kernel Arguments Prolog", false, false)
+
+char &llvm::AMDGPUPreloadKernArgPrologLegacyID =
+ AMDGPUPreloadKernArgPrologLegacy::ID;
+
+FunctionPass *llvm::createAMDGPUPreloadKernArgPrologLegacyPass() {
+ return new AMDGPUPreloadKernArgPrologLegacy();
+}
+
+bool AMDGPUPreloadKernArgPrologLegacy::runOnMachineFunction(
+ MachineFunction &MF) {
+ return AMDGPUPreloadKernArgProlog(MF).run();
+}
+
+AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
+ MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(*ST.getInstrInfo()),
+ TRI(*ST.getRegisterInfo()) {}
+
+bool AMDGPUPreloadKernArgProlog::run() {
+ if (!ST.hasKernargPreload())
+ return false;
+
+ unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
+ if (!NumKernArgPreloadSGPRs)
+ return false;
+
+ createBackCompatBlock(NumKernArgPreloadSGPRs);
+ return true;
+}
+
+void AMDGPUPreloadKernArgProlog::createBackCompatBlock(
+ unsigned NumKernArgPreloadSGPRs) {
+ auto KernelEntryMBB = MF.begin();
+ MachineBasicBlock *BackCompatMBB = MF.CreateMachineBasicBlock();
+ MF.insert(KernelEntryMBB, BackCompatMBB);
+
+ assert(MFI.getUserSGPRInfo().hasKernargSegmentPtr() &&
+ "Kernel argument segment pointer register not set.");
+ Register KernArgSegmentPtr = MFI.getArgInfo().KernargSegmentPtr.getRegister();
+ BackCompatMBB->addLiveIn(KernArgSegmentPtr);
+
+ // Load kernel arguments to SGPRs
+ addBackCompatLoads(BackCompatMBB, KernArgSegmentPtr, NumKernArgPreloadSGPRs);
+
+ // Wait for loads to complete
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+ unsigned Waitcnt =
+ AMDGPU::encodeWaitcnt(IV, getVmcntBitMask(IV), getExpcntBitMask(IV), 0);
+ BuildMI(BackCompatMBB, DebugLoc(), TII.get(AMDGPU::S_WAITCNT))
+ .addImm(Waitcnt);
+
+ // Branch to kernel start
+ BuildMI(BackCompatMBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH))
+ .addMBB(&*KernelEntryMBB);
+ BackCompatMBB->addSuccessor(&*KernelEntryMBB);
+
+ // Create a new basic block for padding to 256 bytes
+ MachineBasicBlock *PadMBB = MF.CreateMachineBasicBlock();
+ MF.insert(++BackCompatMBB->getIterator(), PadMBB);
+ PadMBB->setAlignment(Align(256));
+ PadMBB->addSuccessor(&*KernelEntryMBB);
+}
+
+// Find the largest possible load size that fits with SGPR alignment
+static LoadConfig getLoadParameters(const TargetRegisterInfo &TRI,
+ Register KernArgPreloadSGPR,
+ unsigned NumKernArgPreloadSGPRs) {
+ static constexpr LoadConfig Configs[] = {
+ {8, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM},
+ {4, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM},
+ {2, &AMDGPU::SReg_64RegClass, AMDGPU::S_LOAD_DWORDX2_IMM}};
+
+ for (const auto &Config : Configs) {
+ if (NumKernArgPreloadSGPRs >= Config.Size) {
+ Register LoadReg = TRI.getMatchingSuperReg(KernArgPreloadSGPR,
+ AMDGPU::sub0, Config.RegClass);
+ if (LoadReg != AMDGPU::NoRegister)
+ return LoadConfig(Config.Size, Config.RegClass, Config.Opcode, LoadReg);
+ }
+ }
+
+ // Fallback to a single register
+ return LoadConfig(1, &AMDGPU::SReg_32RegClass, AMDGPU::S_LOAD_DWORD_IMM,
----------------
kerbowa wrote:
The RC is only used with `getMatchingSuperReg` to find the largest pow(2) SGPR tuple that we can load into, based on a range of user SGPRs, so m0 shouldn't be an issue.
I'm trying to load the sequential SGPRs SGPRx-SGPRy in as few S_LOADs as possible. These will always be in the range SGPR0-SGPR16, or whatever the max number of user SGPRs is.
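As a rough, standalone illustration of the greedy split described above (not part of the patch): the sketch below picks, at each position, the widest power-of-two load that still fits. The helper name splitIntoLoadWidths is made up, and the modulo-based alignment check is only a stand-in for the getMatchingSuperReg query that the pass actually performs.

#include <cstdio>
#include <vector>

// Hypothetical helper (not in the patch): break NumSGPRs preloaded SGPRs,
// starting at register index StartSGPR, into the fewest power-of-two s_load
// widths. The modulo check stands in for the getMatchingSuperReg query the
// pass uses to verify that the SGPR tuple actually exists.
static std::vector<unsigned> splitIntoLoadWidths(unsigned StartSGPR,
                                                 unsigned NumSGPRs) {
  static const unsigned Widths[] = {8, 4, 2, 1};
  std::vector<unsigned> Loads;
  while (NumSGPRs > 0) {
    for (unsigned W : Widths) {
      if (NumSGPRs >= W && StartSGPR % W == 0) {
        Loads.push_back(W);
        StartSGPR += W;
        NumSGPRs -= W;
        break;
      }
    }
  }
  return Loads;
}

int main() {
  // e.g. 7 preloaded SGPRs starting at SGPR2 -> widths {2, 4, 1}, i.e. one
  // S_LOAD_DWORDX2, one S_LOAD_DWORDX4 and one S_LOAD_DWORD.
  for (unsigned W : splitIntoLoadWidths(2, 7))
    printf("load width %u\n", W);
  return 0;
}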
https://github.com/llvm/llvm-project/pull/119167