[llvm] [AMDGPU] Add option to prevent insns straddling half cache-line boundaries (PR #150239)

Wed Jul 23 08:49:09 PDT 2025

================
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
   OS.emitRawComment(" transferring at most " + TransferredRegs);
 }
 
+extern cl::opt<bool> PreventHalfCacheLineStraddling;
+
+static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
+                                     const GCNSubtarget &STI,
+                                     MCContext &OutContext) {
+  SmallVector<MCFixup, 4> Fixups;
+  SmallVector<char, 16> CodeBytes;
+
+  std::unique_ptr<MCCodeEmitter> InstEmitter(
+      createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
+  InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
+  return (CodeBytes.size());
+};
+
 void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // FIXME: Enable feature predicate checks once all the test pass.
   // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
   //                                        getSubtargetInfo().getFeatureBits());
 
+  auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
+                                           const MachineFunction *MF,
+                                           const MCInst &LoweredMCI) -> void {
+    const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+
+    unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
+
+    // getInstrSizeInBytes convervatively overestimates the size of branches due
+    // to a NOP added for the 0x3f offset bug.  Any inaccuracies in instruction
+    // sizes will cause problems when avoiding straddling half cache-line
+    // boundaries. A NOP is usually not added so remove the +4 that was added.
+    if (MI->isBranch() && STI.hasOffset3fBug())
+      InstSizeInBytes -= 4;
+    // Rarely, some MachineInstr do not have accurate instruction sizes.  Try to
+    // calculate the size from the lowered MCInst.
+    else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
+             !(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
+               MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
+      InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);
+
+    // FIXME: Workaround bug in V_MADMK_F32 size.
+    if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
+      InstSizeInBytes = 8;
----------------
arsenm wrote:

Directly fix this 

https://github.com/llvm/llvm-project/pull/150239