[llvm] [llvm][AsmPrinter] Add an option to print instruction latencies (PR #113243)

Tue Oct 22 19:24:18 PDT 2024

https://github.com/jroelofs updated https://github.com/llvm/llvm-project/pull/113243

>From eede159a4a5d7787e116f1f853125daf5ab4be5b Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Mon, 21 Oct 2024 17:52:35 -0700
Subject: [PATCH 1/4] [llvm][AsmPrinter] Add an option to print instruction
 latencies

... matching what we have in the disassembler.  This isn't turned on by default
since several of the scheduling models are not completely accurate, and we
don't want to be misleading.
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 94 +++++++++++++++++++++-
 llvm/test/CodeGen/AArch64/latency.ll       | 10 +++
 2 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/latency.ll

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 327e7f7f8a1ed8..015c4cc3d4b721 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -162,6 +162,13 @@ static cl::opt<bool> EmitJumpTableSizesSection(
     cl::desc("Emit a section containing jump table addresses and sizes"),
     cl::Hidden, cl::init(false));
 
+// This isn't turned on by default, since several of the scheduling models are
+// not completely accurate, and we don't want to be misleading.
+static cl::opt<bool> PrintLatency(
+    "asm-print-latency",
+    cl::desc("Print instruction latencies as verbose asm comments."),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
 char AsmPrinter::ID = 0;
@@ -1080,8 +1087,78 @@ void AsmPrinter::emitFunctionEntryLabel() {
   }
 }
 
+/// Gets latency information for \p MI from the itinerary
+/// scheduling model.
+/// \return The maximum expected latency over all the operands or -1
+/// if no information is available.
+static int getItineraryLatency(const MachineInstr &MI,
+                               const MachineFunction *MF,
+                               const MCSubtargetInfo *STI) {
+  const int NoInformationAvailable = -1;
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+  // Check if we have a CPU to get the itinerary information.
+  if (STI->getCPU().empty())
+    return NoInformationAvailable;
+
+  // Get itinerary information.
+  InstrItineraryData IID = STI->getInstrItineraryForCPU(STI->getCPU());
+  // Get the scheduling class of the requested instruction.
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+
+  unsigned Latency = 0;
+
+  for (unsigned Idx = 0, IdxEnd = MI.getNumOperands(); Idx != IdxEnd; ++Idx)
+    if (std::optional<unsigned> OperCycle = IID.getOperandCycle(SCClass, Idx))
+      Latency = std::max(Latency, *OperCycle);
+
+  return (int)Latency;
+}
+
+/// Gets latency information for \p Inst.
+/// \return The maximum expected latency over all the definitions or -1
+/// if no information is available.
+static int getLatency(const MachineInstr &MI, const MCSubtargetInfo *STI) {
+  const MCSchedModel SCModel = STI->getSchedModel();
+  const int NoInformationAvailable = -1;
+
+  const MachineFunction *MF = MI.getMF();
+  if (!MF)
+    return NoInformationAvailable;
+
+  // Check if we have a scheduling model for instructions.
+  if (!SCModel.hasInstrSchedModel())
+    // Try to fall back to the itinerary model if the scheduling model doesn't
+    // have a scheduling table.  Note the default does not have a table.
+    return getItineraryLatency(MI, MF, STI);
+
+  // Get the scheduling class of the requested instruction.
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+  const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass);
+  // Resolving the variant SchedClass requires an MI to pass to
+  // SubTargetInfo::resolveSchedClass.
+  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
+    return NoInformationAvailable;
+
+  // Compute output latency.
+  int16_t Latency = 0;
+  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+       DefIdx != DefEnd; ++DefIdx) {
+    // Lookup the definition's write latency in SubtargetInfo.
+    const MCWriteLatencyEntry *WLEntry =
+        STI->getWriteLatencyEntry(SCDesc, DefIdx);
+    Latency = std::max(Latency, WLEntry->Cycles);
+  }
+
+  return Latency;
+}
+
 /// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(const MachineInstr &MI, const MCSubtargetInfo *STI,
+                         raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getMF();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
@@ -1109,6 +1186,13 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   // Check for spill-induced copies
   if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
     CommentOS << " Reload Reuse\n";
+
+  if (PrintLatency) {
+    int Latency = getLatency(MI, STI);
+    // Report only interesting latencies.
+    if (1 < Latency)
+      CommentOS << " Latency: " << Latency << "\n";
+  }
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -1750,6 +1834,12 @@ void AsmPrinter::emitFunctionBody() {
   int NumInstsInFunction = 0;
   bool IsEHa = MMI->getModule()->getModuleFlag("eh-asynch");
 
+  const MCSubtargetInfo *STI = nullptr;
+  if (this->MF)
+    STI = &getSubtargetInfo();
+  else
+    STI = TM.getMCSubtargetInfo();
+
   bool CanDoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
@@ -1773,7 +1863,7 @@ void AsmPrinter::emitFunctionBody() {
         Handler->beginInstruction(&MI);
 
       if (isVerbose())
-        emitComments(MI, OutStreamer->getCommentOS());
+        emitComments(MI, STI, OutStreamer->getCommentOS());
 
       switch (MI.getOpcode()) {
       case TargetOpcode::CFI_INSTRUCTION:
diff --git a/llvm/test/CodeGen/AArch64/latency.ll b/llvm/test/CodeGen/AArch64/latency.ll
new file mode 100644
index 00000000000000..b722eec3e2571a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/latency.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone -asm-print-latency=1 | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone -asm-print-latency=0 | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone                      | FileCheck %s --match-full-lines --check-prefix=OFF
+
+define <4 x i64> @load_v4i64(ptr %ptr){
+; ON:     ldp q0, q1, [x0] ; Latency: 4
+; OFF:    ldp q0, q1, [x0]
+  %a = load <4 x i64>, ptr %ptr
+  ret <4 x i64> %a
+}

>From 62131f6e4685510daca04685e99ed3994b8a485e Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 22 Oct 2024 19:08:08 -0700
Subject: [PATCH 2/4] add a test for a cpu with itineraries

---
 llvm/test/CodeGen/ARM/latency.ll | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 llvm/test/CodeGen/ARM/latency.ll

diff --git a/llvm/test/CodeGen/ARM/latency.ll b/llvm/test/CodeGen/ARM/latency.ll
new file mode 100644
index 00000000000000..ada6254666fdef
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/latency.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=thumb-none-eabi %s -o - -mcpu=cortex-m0 -asm-print-latency=1 | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: llc -mtriple=thumb-none-eabi %s -o - -mcpu=cortex-m0 -asm-print-latency=0 | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: llc -mtriple=thumb-none-eabi %s -o - -mcpu=cortex-m0                      | FileCheck %s --match-full-lines --check-prefix=OFF
+
+define i64 @load_i64(ptr %ptr){
+; ON:   ldr     r2, [r0]                        @  Latency: 4
+; ON:   ldr     r1, [r0, #4]                    @  Latency: 4
+; ON:   mov     r0, r2                          @  Latency: 2
+; ON:   bx      lr
+; OFF:  ldr     r2, [r0]
+; OFF:  ldr     r1, [r0, #4]
+; OFF:  mov     r0, r2
+; OFF:  bx      lr
+  %a = load i64, ptr %ptr
+  ret i64 %a
+}

>From 1aa5800f247a460dd252633711aa8fb5826d4b7d Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 22 Oct 2024 19:18:57 -0700
Subject: [PATCH 3/4] use MCSchedModel::computeInstrLatency

---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp  | 37 +++++++--------------
 llvm/lib/MC/MCDisassembler/Disassembler.cpp |  2 +-
 2 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 015c4cc3d4b721..e6e3a668c6070b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1091,15 +1091,14 @@ void AsmPrinter::emitFunctionEntryLabel() {
 /// scheduling model.
 /// \return The maximum expected latency over all the operands or -1
 /// if no information is available.
-static int getItineraryLatency(const MachineInstr &MI,
+static std::optional<int> getItineraryLatency(const MachineInstr &MI,
                                const MachineFunction *MF,
                                const MCSubtargetInfo *STI) {
-  const int NoInformationAvailable = -1;
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
   // Check if we have a CPU to get the itinerary information.
   if (STI->getCPU().empty())
-    return NoInformationAvailable;
+    return std::nullopt;
 
   // Get itinerary information.
   InstrItineraryData IID = STI->getInstrItineraryForCPU(STI->getCPU());
@@ -1113,14 +1112,14 @@ static int getItineraryLatency(const MachineInstr &MI,
     if (std::optional<unsigned> OperCycle = IID.getOperandCycle(SCClass, Idx))
       Latency = std::max(Latency, *OperCycle);
 
-  return (int)Latency;
+  return int(Latency);
 }
 
 /// Gets latency information for \p Inst.
 /// \return The maximum expected latency over all the definitions or -1
 /// if no information is available.
-static int getLatency(const MachineInstr &MI, const MCSubtargetInfo *STI) {
-  const MCSchedModel SCModel = STI->getSchedModel();
+static std::optional<int> getLatency(const MachineInstr &MI, const MCSubtargetInfo *STI) {
+  const MCSchedModel &SCModel = STI->getSchedModel();
   const int NoInformationAvailable = -1;
 
   const MachineFunction *MF = MI.getMF();
@@ -1137,22 +1136,9 @@ static int getLatency(const MachineInstr &MI, const MCSubtargetInfo *STI) {
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   const MCInstrDesc &Desc = TII->get(MI.getOpcode());
   unsigned SCClass = Desc.getSchedClass();
-  const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass);
-  // Resolving the variant SchedClass requires an MI to pass to
-  // SubTargetInfo::resolveSchedClass.
-  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
+  int Latency = SCModel.computeInstrLatency(*STI, SCClass);
+  if (Latency <= 0)
     return NoInformationAvailable;
-
-  // Compute output latency.
-  int16_t Latency = 0;
-  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
-       DefIdx != DefEnd; ++DefIdx) {
-    // Lookup the definition's write latency in SubtargetInfo.
-    const MCWriteLatencyEntry *WLEntry =
-        STI->getWriteLatencyEntry(SCDesc, DefIdx);
-    Latency = std::max(Latency, WLEntry->Cycles);
-  }
-
   return Latency;
 }
 
@@ -1188,10 +1174,11 @@ static void emitComments(const MachineInstr &MI, const MCSubtargetInfo *STI,
     CommentOS << " Reload Reuse\n";
 
   if (PrintLatency) {
-    int Latency = getLatency(MI, STI);
-    // Report only interesting latencies.
-    if (1 < Latency)
-      CommentOS << " Latency: " << Latency << "\n";
+    if (auto Latency = getLatency(MI, STI)) {
+      // Report only interesting latencies.
+      if (1 < *Latency)
+        CommentOS << " Latency: " << *Latency << "\n";
+    }
   }
 }
 
diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index 5e5a163c290244..fbe487d8da0786 100644
--- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -195,7 +195,7 @@ static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
 static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   // Try to compute scheduling information.
   const MCSubtargetInfo *STI = DC->getSubtargetInfo();
-  const MCSchedModel SCModel = STI->getSchedModel();
+  const MCSchedModel &SCModel = STI->getSchedModel();
   const int NoInformationAvailable = -1;
 
   // Check if we have a scheduling model for instructions.

>From 9c59ee34d26d52e594139a0ffc583dba409d239e Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 22 Oct 2024 19:23:50 -0700
Subject: [PATCH 4/4] simplify both implementations

---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp  |  5 ++---
 llvm/lib/MC/MCDisassembler/Disassembler.cpp | 23 +++------------------
 2 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index e6e3a668c6070b..48d6577db6f6d0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1120,11 +1120,10 @@ static std::optional<int> getItineraryLatency(const MachineInstr &MI,
 /// if no information is available.
 static std::optional<int> getLatency(const MachineInstr &MI, const MCSubtargetInfo *STI) {
   const MCSchedModel &SCModel = STI->getSchedModel();
-  const int NoInformationAvailable = -1;
 
   const MachineFunction *MF = MI.getMF();
   if (!MF)
-    return NoInformationAvailable;
+    return std::nullopt;
 
   // Check if we have a scheduling model for instructions.
   if (!SCModel.hasInstrSchedModel())
@@ -1138,7 +1137,7 @@ static std::optional<int> getLatency(const MachineInstr &MI, const MCSubtargetIn
   unsigned SCClass = Desc.getSchedClass();
   int Latency = SCModel.computeInstrLatency(*STI, SCClass);
   if (Latency <= 0)
-    return NoInformationAvailable;
+    return std::nullopt;
   return Latency;
 }
 
diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index fbe487d8da0786..b99ab7d1511123 100644
--- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -186,7 +186,7 @@ static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
     if (std::optional<unsigned> OperCycle = IID.getOperandCycle(SCClass, Idx))
       Latency = std::max(Latency, *OperCycle);
 
-  return (int)Latency;
+  return int(Latency);
 }
 
 /// Gets latency information for \p Inst, based on \p DC information.
@@ -196,7 +196,6 @@ static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   // Try to compute scheduling information.
   const MCSubtargetInfo *STI = DC->getSubtargetInfo();
   const MCSchedModel &SCModel = STI->getSchedModel();
-  const int NoInformationAvailable = -1;
 
   // Check if we have a scheduling model for instructions.
   if (!SCModel.hasInstrSchedModel())
@@ -205,25 +204,9 @@ static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
     return getItineraryLatency(DC, Inst);
 
   // Get the scheduling class of the requested instruction.
-  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+  const MCInstrDesc &Desc = DC->getInstrInfo()->get(Inst.getOpcode());
   unsigned SCClass = Desc.getSchedClass();
-  const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass);
-  // Resolving the variant SchedClass requires an MI to pass to
-  // SubTargetInfo::resolveSchedClass.
-  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
-    return NoInformationAvailable;
-
-  // Compute output latency.
-  int16_t Latency = 0;
-  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
-       DefIdx != DefEnd; ++DefIdx) {
-    // Lookup the definition's write latency in SubtargetInfo.
-    const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc,
-                                                                   DefIdx);
-    Latency = std::max(Latency, WLEntry->Cycles);
-  }
-
-  return Latency;
+  return SCModel.computeInstrLatency(*STI, SCClass);
 }
 
 /// Emits latency information in DC->CommentStream for \p Inst, based