[llvm] [AMDGPU][NFCI] Remove preload kernarg alloc dep on DAG isel path (PR #96030)

Fri Jun 21 13:21:00 PDT 2024

================
@@ -2465,71 +2465,94 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+                                uint64_t ExplicitArgOffset, uint64_t ArgOffset,
+                                unsigned ArgSize, unsigned Idx,
+                                MachineFunction &MF, const SIRegisterInfo &TRI,
+                                SIMachineFunctionInfo &Info, CCState &CCInfo) {
+  if (ArgOffset >= ExplicitArgOffset)
+    return false;
+
+  const Align KernelArgBaseAlign = Align(16);
+  Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+  unsigned NumAllocSGPRs = alignTo(ArgSize, 4) / 4;
+
+  // Arg is preloaded into the previous SGPR.
+  if (ArgSize < 4 && Alignment < 4) {
+    Info.getArgInfo().PreloadKernArgs[Idx].Regs.push_back(
+        Info.getArgInfo().PreloadKernArgs[Idx - 1].Regs[0]);
+    return true;
+  }
+
+  unsigned Padding = ArgOffset - LastExplicitArgOffset;
+  unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+  // Check for free user SGPRs for preloading.
+  if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+      Info.getUserSGPRInfo().getNumFreeUserSGPRs()) {
+    return false;
+  }
+
+  // Preload this argument.
+  const TargetRegisterClass *RC =
+      TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+  SmallVectorImpl<MCRegister> *PreloadRegs =
+      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+  if (PreloadRegs->size() > 1)
+    RC = &AMDGPU::SGPR_32RegClass;
+
+  for (auto &Reg : *PreloadRegs) {
----------------
arsenm wrote:

No auto, no reference 

https://github.com/llvm/llvm-project/pull/96030