[llvm] [MCA][X86] Pretend To Have a Stack Engine (PR #153348)

Aiden Grossman via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 14 09:03:29 PDT 2025


https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/153348

>From aa7076ca0b50e7152bb015bf543e1c182f3c58c3 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Wed, 13 Aug 2025 05:16:40 +0000
Subject: [PATCH] [MCA][X86] Pretend To Have a Stack Engine

This patch removes RSP dependencies from push and pop instructions to
pretend that we have a stack engine. Due to the complexity involved, it
does not model details such as sync uops, even though they are relevant
to real implementations. This is enabled on all X86 CPUs, given that LLVM
does not have a scheduling model for any X86 CPU without a stack engine.

This fixes #152008.
---
 .../lib/Target/X86/MCA/X86CustomBehaviour.cpp | 15 +++
 llvm/lib/Target/X86/MCA/X86CustomBehaviour.h  |  5 +
 .../tools/llvm-mca/X86/stack-engine-pop.s     | 92 +++++++++++++++++++
 .../tools/llvm-mca/X86/stack-engine-push.s    | 92 +++++++++++++++++++
 4 files changed, 204 insertions(+)
 create mode 100644 llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
 create mode 100644 llvm/test/tools/llvm-mca/X86/stack-engine-push.s

diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
index 817e88d8a0bc0..71cb49330e542 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -36,11 +36,26 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
   }
 }
 
+void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
+                                         const MCInst &MCI) {
+  // Erase the RSP def of push/pop so later stack ops do not serialize on it.
+  if (!X86::isPOP(MCI.getOpcode()) && !X86::isPUSH(MCI.getOpcode()))
+    return;
+  auto &Defs = Inst->getDefs();
+  auto *StackRegisterDef = llvm::find_if(
+      Defs, [](const WriteState &W) { return W.getRegisterID() == X86::RSP; });
+  assert(StackRegisterDef != Defs.end() &&
+         "Expected push/pop instruction to implicitly define RSP.");
+  if (StackRegisterDef != Defs.end()) // erase(end()) is UB without asserts.
+    Defs.erase(StackRegisterDef);
+}
+
 void X86InstrPostProcess::postProcessInstruction(
     std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
   // Currently, we only modify certain instructions' IsALoadBarrier and
   // IsAStoreBarrier flags.
   setMemBarriers(Inst, MCI);
+  useStackEngine(Inst, MCI); // Also drop RSP defs from push/pop instructions.
 }
 
 } // namespace mca
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
index 4a83ba848dd88..c5459e42dfc9f 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
   /// as load and store barriers.
   void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
 
+  /// Called within X86InstrPostProcess to remove the rsp write (def) from
+  /// push and pop instructions to better simulate the stack engine. We
+  /// currently do not model features of the stack engine like sync uops.
+  void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
 public:
   X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
       : InstrPostProcess(STI, MCII) {}
diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
new file mode 100644
index 0000000000000..2ffb52ae61fc4
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s
+
+movq $0x80, %rsp
+popq %rax
+popq %rcx
+popq %rdx
+popq %rbx
+popq %r12
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      12
+# CHECK-NEXT: Total Cycles:      14
+# CHECK-NEXT: Total uOps:        22
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    1.57
+# CHECK-NEXT: IPC:               0.86
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.25                        movq	$128, %rsp
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rax
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rcx
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rdx
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rbx
+# CHECK-NEXT:  2      6     0.50    *                   popq	%r12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     1.50   1.50   2.50   2.50    -     1.50   1.50    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -     movq	$128, %rsp
+# CHECK-NEXT:  -      -     0.50    -     0.50   0.50    -     0.50    -      -     popq	%rax
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -      -     0.50    -     popq	%rcx
+# CHECK-NEXT:  -      -     0.50    -     0.50   0.50    -     0.50    -      -     popq	%rdx
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -      -     0.50    -     popq	%rbx
+# CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     popq	%r12
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .  .   movq	$128, %rsp
+# CHECK-NEXT: [0,1]     D=eeeeeeER.  .   popq	%rax
+# CHECK-NEXT: [0,2]     D=eeeeeeER.  .   popq	%rcx
+# CHECK-NEXT: [0,3]     .D=eeeeeeER  .   popq	%rdx
+# CHECK-NEXT: [0,4]     .D=eeeeeeER  .   popq	%rbx
+# CHECK-NEXT: [0,5]     .D==eeeeeeER .   popq	%r12
+# CHECK-NEXT: [1,0]     . DeE------R .   movq	$128, %rsp
+# CHECK-NEXT: [1,1]     . D=eeeeeeER .   popq	%rax
+# CHECK-NEXT: [1,2]     . D==eeeeeeER.   popq	%rcx
+# CHECK-NEXT: [1,3]     .  D=eeeeeeER.   popq	%rdx
+# CHECK-NEXT: [1,4]     .  D==eeeeeeER   popq	%rbx
+# CHECK-NEXT: [1,5]     .  D==eeeeeeER   popq	%r12
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     1.0    1.0    3.0       movq	$128, %rsp
+# CHECK-NEXT: 1.     2     2.0    0.0    0.0       popq	%rax
+# CHECK-NEXT: 2.     2     2.5    0.5    0.0       popq	%rcx
+# CHECK-NEXT: 3.     2     2.0    1.0    0.0       popq	%rdx
+# CHECK-NEXT: 4.     2     2.5    1.5    0.0       popq	%rbx
+# CHECK-NEXT: 5.     2     3.0    2.0    0.0       popq	%r12
+# CHECK-NEXT:        2     2.2    1.0    0.5       <total>
diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-push.s b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s
new file mode 100644
index 0000000000000..fc394d4c1e7d3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s
+
+movq $0x80, %rsp
+pushq %rax
+pushq %rcx
+pushq %rdx
+pushq %rbx
+pushq %r12
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      12
+# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total uOps:        32
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    2.13
+# CHECK-NEXT: IPC:               0.80
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.25                        movq	$128, %rsp
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rax
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rcx
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rdx
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rbx
+# CHECK-NEXT:  3      2     1.00           *            pushq	%r12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     1.50   1.50   1.50   1.50   5.00   1.50   1.50   2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -     movq	$128, %rsp
+# CHECK-NEXT:  -      -     0.50    -     0.50    -     1.00   0.50    -     0.50   pushq	%rax
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   1.00    -     0.50   0.50   pushq	%rcx
+# CHECK-NEXT:  -      -     0.50    -     0.50   0.50   1.00   0.50    -      -     pushq	%rdx
+# CHECK-NEXT:  -      -      -     0.50   0.50    -     1.00   0.50    -     0.50   pushq	%rbx
+# CHECK-NEXT:  -      -     0.50   0.50    -     0.50   1.00    -      -     0.50   pushq	%r12
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   .   movq	$128, %rsp
+# CHECK-NEXT: [0,1]     D=eeER    .   .   pushq	%rax
+# CHECK-NEXT: [0,2]     .D=eeER   .   .   pushq	%rcx
+# CHECK-NEXT: [0,3]     .D==eeER  .   .   pushq	%rdx
+# CHECK-NEXT: [0,4]     . D==eeER .   .   pushq	%rbx
+# CHECK-NEXT: [0,5]     . D===eeER.   .   pushq	%r12
+# CHECK-NEXT: [1,0]     .  DeE---R.   .   movq	$128, %rsp
+# CHECK-NEXT: [1,1]     .  D===eeER   .   pushq	%rax
+# CHECK-NEXT: [1,2]     .   D===eeER  .   pushq	%rcx
+# CHECK-NEXT: [1,3]     .   D====eeER .   pushq	%rdx
+# CHECK-NEXT: [1,4]     .    D====eeER.   pushq	%rbx
+# CHECK-NEXT: [1,5]     .    D=====eeER   pushq	%r12
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     1.0    1.0    1.5       movq	$128, %rsp
+# CHECK-NEXT: 1.     2     3.0    0.5    0.0       pushq	%rax
+# CHECK-NEXT: 2.     2     3.0    1.0    0.0       pushq	%rcx
+# CHECK-NEXT: 3.     2     4.0    1.0    0.0       pushq	%rdx
+# CHECK-NEXT: 4.     2     4.0    1.0    0.0       pushq	%rbx
+# CHECK-NEXT: 5.     2     5.0    1.0    0.0       pushq	%r12
+# CHECK-NEXT:        2     3.3    0.9    0.3       <total>



More information about the llvm-commits mailing list