[llvm] [llvm-mca] Add command line option `-use-load-latency` (PR #94566)

Chinmay Deshpande via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 5 21:17:38 PDT 2024


https://github.com/chinmaydd created https://github.com/llvm/llvm-project/pull/94566

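Latency for load operations can be modeled more accurately by using the `LoadLatency` field of the scheduling model provided for the target architecture. This PR adds a new command line flag, `use-load-latency`, that enables this behavior: when the flag is set, the latency of every instruction that may load is raised to at least `LoadLatency` (i.e. the maximum of `LoadLatency` and the latency computed from the instruction's scheduling class). The flag is off by default.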

This is an attempt to address the comment at [LSUnit.h#370](https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/MCA/HardwareUnits/LSUnit.h#L370). I would be happy to hear whether there is a better way to approach this.

From 553c8b2ed8c86657f296acfb4e7faa184e407bb4 Mon Sep 17 00:00:00 2001
From: Chinmay Deshpande <cddeshpa at uci.edu>
Date: Wed, 5 Jun 2024 21:05:07 -0700
Subject: [PATCH] [llvm-mca] Add command line option `-use-load-latency`

Latency for load operations could be modeled better by
using the `LoadLatency` field in the provided scheduling
model for the target architecture.
---
 llvm/include/llvm/MCA/InstrBuilder.h          |  4 +-
 llvm/lib/MCA/InstrBuilder.cpp                 | 18 ++++--
 .../tools/llvm-mca/X86/use-load-latency.s     | 58 +++++++++++++++++++
 llvm/tools/llvm-mca/llvm-mca.cpp              |  8 ++-
 llvm/unittests/tools/llvm-mca/MCATestBase.cpp |  3 +-
 .../tools/llvm-mca/X86/TestIncrementalMCA.cpp |  6 +-
 6 files changed, 87 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/X86/use-load-latency.s

diff --git a/llvm/include/llvm/MCA/InstrBuilder.h b/llvm/include/llvm/MCA/InstrBuilder.h
index 00c7942e4fa16..6880a213d48b5 100644
--- a/llvm/include/llvm/MCA/InstrBuilder.h
+++ b/llvm/include/llvm/MCA/InstrBuilder.h
@@ -79,6 +79,7 @@ class InstrBuilder {
   bool FirstCallInst;
   bool FirstReturnInst;
   unsigned CallLatency;
+  bool UseLoadLatency;
 
   using InstRecycleCallback = std::function<Instruction *(const InstrDesc &)>;
   InstRecycleCallback InstRecycleCB;
@@ -99,7 +100,8 @@ class InstrBuilder {
 public:
   InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
                const MCRegisterInfo &RI, const MCInstrAnalysis *IA,
-               const InstrumentManager &IM, unsigned CallLatency);
+               const InstrumentManager &IM, unsigned CallLatency,
+               bool UseLoadLatency);
 
   void clear() {
     Descriptors.clear();
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index d5cbdc5de0b84..db4eda40de9d0 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -31,9 +31,10 @@ InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
                            const llvm::MCInstrInfo &mcii,
                            const llvm::MCRegisterInfo &mri,
                            const llvm::MCInstrAnalysis *mcia,
-                           const mca::InstrumentManager &im, unsigned cl)
+                           const mca::InstrumentManager &im, unsigned cl,
+                           bool ull)
     : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), IM(im), FirstCallInst(true),
-      FirstReturnInst(true), CallLatency(cl) {
+      FirstReturnInst(true), CallLatency(cl), UseLoadLatency(ull) {
   const MCSchedModel &SM = STI.getSchedModel();
   ProcResourceMasks.resize(SM.getNumProcResourceKinds());
   computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
@@ -220,8 +221,8 @@ static void initializeUsedResources(InstrDesc &ID,
 
 static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
                               const MCSchedClassDesc &SCDesc,
-                              const MCSubtargetInfo &STI,
-                              unsigned CallLatency) {
+                              const MCSubtargetInfo &STI, unsigned CallLatency,
+                              bool UseLoadLatency) {
   if (MCDesc.isCall()) {
     // We cannot estimate how long this call will take.
     // Artificially set an arbitrarily high latency.
@@ -230,6 +231,13 @@ static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
   }
 
   int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+
+  // If `UseLoadLatency` is set, instructions that may load get a latency of at
+  // least `MCSchedModel::LoadLatency`.
+  if (MCDesc.mayLoad() && UseLoadLatency) {
+    const auto &SM = STI.getSchedModel();
+    Latency = std::max(int(SM.LoadLatency), Latency);
+  }
   // If latency is unknown, then conservatively assume the MaxLatency set for
   // calls.
   ID.MaxLatency = Latency < 0 ? CallLatency : static_cast<unsigned>(Latency);
@@ -582,7 +590,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
   }
 
   initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
-  computeMaxLatency(*ID, MCDesc, SCDesc, STI, CallLatency);
+  computeMaxLatency(*ID, MCDesc, SCDesc, STI, CallLatency, UseLoadLatency);
 
   if (Error Err = verifyOperands(MCDesc, MCI))
     return std::move(Err);
diff --git a/llvm/test/tools/llvm-mca/X86/use-load-latency.s b/llvm/test/tools/llvm-mca/X86/use-load-latency.s
new file mode 100644
index 0000000000000..54c37c5c8699f
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/use-load-latency.s
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2                   -iterations=1 %s | FileCheck --check-prefixes=ALL,DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -use-load-latency -iterations=1 %s | FileCheck --check-prefixes=ALL,CUSTOM %s
+
+movq      (%rsp), %rdi
+
+# ALL:          Iterations:        1
+# ALL-NEXT:     Instructions:      1
+
+# CUSTOM-NEXT:  Total Cycles:      8
+# DEFAULT-NEXT: Total Cycles:      6
+
+# ALL-NEXT:     Total uOps:        1
+
+# ALL:          Dispatch Width:    2
+
+# CUSTOM-NEXT:  uOps Per Cycle:    0.13
+# CUSTOM-NEXT:  IPC:               0.13
+
+# DEFAULT-NEXT: uOps Per Cycle:    0.17
+# DEFAULT-NEXT: IPC:               0.17
+
+# ALL-NEXT:     Block RThroughput: 1.0
+
+# ALL:          Instruction Info:
+# ALL-NEXT:     [1]: #uOps
+# ALL-NEXT:     [2]: Latency
+# ALL-NEXT:     [3]: RThroughput
+# ALL-NEXT:     [4]: MayLoad
+# ALL-NEXT:     [5]: MayStore
+# ALL-NEXT:     [6]: HasSideEffects (U)
+
+# ALL:          [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# ALL-NEXT:      1      3     1.00    *                   movq	(%rsp), %rdi
+
+# ALL:          Resources:
+# ALL-NEXT:     [0]   - JALU0
+# ALL-NEXT:     [1]   - JALU1
+# ALL-NEXT:     [2]   - JDiv
+# ALL-NEXT:     [3]   - JFPA
+# ALL-NEXT:     [4]   - JFPM
+# ALL-NEXT:     [5]   - JFPU0
+# ALL-NEXT:     [6]   - JFPU1
+# ALL-NEXT:     [7]   - JLAGU
+# ALL-NEXT:     [8]   - JMul
+# ALL-NEXT:     [9]   - JSAGU
+# ALL-NEXT:     [10]  - JSTC
+# ALL-NEXT:     [11]  - JVALU0
+# ALL-NEXT:     [12]  - JVALU1
+# ALL-NEXT:     [13]  - JVIMUL
+
+# ALL:          Resource pressure per iteration:
+# ALL-NEXT:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# ALL-NEXT:      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -
+
+# ALL:          Resource pressure by instruction:
+# ALL-NEXT:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# ALL-NEXT:      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -     movq	(%rsp), %rdi
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index cc5d4f5fa05de..927b2b97c4d68 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -140,6 +140,11 @@ static cl::opt<unsigned>
                 cl::desc("Number of cycles to assume for a call instruction"),
                 cl::cat(ToolOptions), cl::init(100U));
 
+static cl::opt<bool> UseLoadLatency(
+    "use-load-latency", cl::Hidden,
+    cl::desc("Use target specific latency for load instructions"),
+    cl::cat(ToolOptions), cl::init(false));
+
 enum class SkipType { NONE, LACK_SCHED, PARSE_FAILURE, ANY_FAILURE };
 
 static cl::opt<enum SkipType> SkipUnsupportedInstructions(
@@ -573,7 +578,8 @@ int main(int argc, char **argv) {
   }
 
   // Create an instruction builder.
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, CallLatency);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, CallLatency,
+                       UseLoadLatency);
 
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
diff --git a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
index 4a39f5e663f23..515fbe5bd6b69 100644
--- a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
+++ b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
@@ -66,7 +66,8 @@ Error MCATestBase::runBaselineMCA(json::Object &Result, ArrayRef<MCInst> Insts,
 
   // Default InstrumentManager
   auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100,
+                       /*UseLoadLatency*/ false);
 
   const SmallVector<mca::Instrument *> Instruments;
   SmallVector<std::unique_ptr<mca::Instruction>> LoweredInsts;
diff --git a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
index ac35dce522ae1..80a2f49ca4aa4 100644
--- a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
+++ b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
@@ -33,7 +33,8 @@ TEST_F(X86TestBase, TestResumablePipeline) {
   P->addEventListener(SV.get());
 
   auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100,
+                       /*UseLoadLatency*/ false);
 
   const SmallVector<mca::Instrument *> Instruments;
   // Tile size = 7
@@ -124,7 +125,8 @@ TEST_F(X86TestBase, TestInstructionRecycling) {
   // Default InstrumentManager
   auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
 
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100,
+                       /*UseLoadLatency*/ false);
   IB.setInstRecycleCallback(GetRecycledInst);
 
   const SmallVector<mca::Instrument *> Instruments;



More information about the llvm-commits mailing list