[llvm] [llvm][AsmPrinter] Add an option to print instruction latencies (PR #113243)

Mon Oct 21 17:57:56 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Jon Roelofs (jroelofs)

<details>
<summary>Changes</summary>

... matching what we have in the disassembler.  This isn't turned on by default since several of the scheduling models are not completely accurate, and we don't want to be misleading.

---
Full diff: https://github.com/llvm/llvm-project/pull/113243.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp (+92-2) 
- (added) llvm/test/CodeGen/AArch64/latency.ll (+10) 


``````````diff

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 327e7f7f8a1ed8..015c4cc3d4b721 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -162,6 +162,13 @@ static cl::opt<bool> EmitJumpTableSizesSection(
     cl::desc("Emit a section containing jump table addresses and sizes"),
     cl::Hidden, cl::init(false));
 
+// This isn't turned on by default, since several of the scheduling models are
+// not completely accurate, and we don't want to be misleading.
+static cl::opt<bool> PrintLatency(
+    "asm-print-latency",
+    cl::desc("Print instruction latencies as verbose asm comments."),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
 char AsmPrinter::ID = 0;
@@ -1080,8 +1087,78 @@ void AsmPrinter::emitFunctionEntryLabel() {
   }
 }
 
+/// Gets latency information for \p Inst from the itinerary
+/// scheduling model.
+/// \return The maximum expected latency over all the operands or -1
+/// if no information is available.
+static int getItineraryLatency(const MachineInstr &MI,
+                               const MachineFunction *MF,
+                               const MCSubtargetInfo *STI) {
+  const int NoInformationAvailable = -1;
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+  // Check if we have a CPU to get the itinerary information.
+  if (STI->getCPU().empty())
+    return NoInformationAvailable;
+
+  // Get itinerary information.
+  InstrItineraryData IID = STI->getInstrItineraryForCPU(STI->getCPU());
+  // Get the scheduling class of the requested instruction.
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+
+  unsigned Latency = 0;
+
+  for (unsigned Idx = 0, IdxEnd = MI.getNumOperands(); Idx != IdxEnd; ++Idx)
+    if (std::optional<unsigned> OperCycle = IID.getOperandCycle(SCClass, Idx))
+      Latency = std::max(Latency, *OperCycle);
+
+  return (int)Latency;
+}
+
+/// Gets latency information for \p Inst.
+/// \return The maximum expected latency over all the definitions or -1
+/// if no information is available.
+static int getLatency(const MachineInstr &MI, const MCSubtargetInfo *STI) {
+  const MCSchedModel SCModel = STI->getSchedModel();
+  const int NoInformationAvailable = -1;
+
+  const MachineFunction *MF = MI.getMF();
+  if (!MF)
+    return NoInformationAvailable;
+
+  // Check if we have a scheduling model for instructions.
+  if (!SCModel.hasInstrSchedModel())
+    // Try to fall back to the itinerary model if the scheduling model doesn't
+    // have a scheduling table.  Note the default does not have a table.
+    return getItineraryLatency(MI, MF, STI);
+
+  // Get the scheduling class of the requested instruction.
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+  const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass);
+  // Resolving the variant SchedClass requires an MI to pass to
+  // SubTargetInfo::resolveSchedClass.
+  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
+    return NoInformationAvailable;
+
+  // Compute output latency.
+  int16_t Latency = 0;
+  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+       DefIdx != DefEnd; ++DefIdx) {
+    // Lookup the definition's write latency in SubtargetInfo.
+    const MCWriteLatencyEntry *WLEntry =
+        STI->getWriteLatencyEntry(SCDesc, DefIdx);
+    Latency = std::max(Latency, WLEntry->Cycles);
+  }
+
+  return Latency;
+}
+
 /// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(const MachineInstr &MI, const MCSubtargetInfo *STI,
+                         raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getMF();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
@@ -1109,6 +1186,13 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   // Check for spill-induced copies
   if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
     CommentOS << " Reload Reuse\n";
+
+  if (PrintLatency) {
+    int Latency = getLatency(MI, STI);
+    // Report only interesting latencies.
+    if (1 < Latency)
+      CommentOS << " Latency: " << Latency << "\n";
+  }
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -1750,6 +1834,12 @@ void AsmPrinter::emitFunctionBody() {
   int NumInstsInFunction = 0;
   bool IsEHa = MMI->getModule()->getModuleFlag("eh-asynch");
 
+  const MCSubtargetInfo *STI = nullptr;
+  if (this->MF)
+    STI = &getSubtargetInfo();
+  else
+    STI = TM.getMCSubtargetInfo();
+
   bool CanDoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
@@ -1773,7 +1863,7 @@ void AsmPrinter::emitFunctionBody() {
         Handler->beginInstruction(&MI);
 
       if (isVerbose())
-        emitComments(MI, OutStreamer->getCommentOS());
+        emitComments(MI, STI, OutStreamer->getCommentOS());
 
       switch (MI.getOpcode()) {
       case TargetOpcode::CFI_INSTRUCTION:
diff --git a/llvm/test/CodeGen/AArch64/latency.ll b/llvm/test/CodeGen/AArch64/latency.ll
new file mode 100644
index 00000000000000..b722eec3e2571a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/latency.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone -asm-print-latency=1 | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone -asm-print-latency=0 | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone                      | FileCheck %s --match-full-lines --check-prefix=OFF
+
+define <4 x i64> @load_v4i64(ptr %ptr){
+; ON:     ldp q0, q1, [x0] ; Latency: 4
+; OFF:    ldp q0, q1, [x0]
+  %a = load <4 x i64>, ptr %ptr
+  ret <4 x i64> %a
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/113243