[llvm] 5b2c3aa - [MCA][X86] Pretend To Have a Stack Engine (#153348)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 18 06:44:47 PDT 2025
Author: Aiden Grossman
Date: 2025-08-18T13:44:43Z
New Revision: 5b2c3aac90450ecb78394f61afc7e9c5e955abc7
URL: https://github.com/llvm/llvm-project/commit/5b2c3aac90450ecb78394f61afc7e9c5e955abc7
DIFF: https://github.com/llvm/llvm-project/commit/5b2c3aac90450ecb78394f61afc7e9c5e955abc7.diff
LOG: [MCA][X86] Pretend To Have a Stack Engine (#153348)
This patch removes RSP dependencies from push and pop instructions to
pretend that we have a stack engine. For simplicity, it does not model
relevant implementation details such as sync uops.
This is just enabled on all X86 CPUs given LLVM does not have a
scheduling model for any X86 CPU that does not have a stack engine.
This fixes #152008.
Added:
llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
llvm/test/tools/llvm-mca/X86/stack-engine-push.s
Modified:
llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
index 817e88d8a0bc0..e2a1bbf383b3c 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
}
}
+void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) {
+ // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we
+ // have not done the necessary benchmarking to see if they are also
+ // optimized by the stack engine.
+ // TODO: We currently just remove all RSP writes from stack operations. This
+ // is not fully correct because we do not model sync uops which will
+ // delay subsequent rsp using non-stack instructions.
+ if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) {
+ auto *StackRegisterDef =
+ llvm::find_if(Inst->getDefs(), [](const WriteState &State) {
+ return State.getRegisterID() == X86::RSP;
+ });
+ assert(
+ StackRegisterDef != Inst->getDefs().end() &&
+ "Expected push instruction to implicitly use stack pointer register.");
+ Inst->getDefs().erase(StackRegisterDef);
+ }
+}
+
void X86InstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
- // Currently, we only modify certain instructions' IsALoadBarrier and
- // IsAStoreBarrier flags.
+ // Set IsALoadBarrier and IsAStoreBarrier flags.
setMemBarriers(Inst, MCI);
+ useStackEngine(Inst, MCI);
}
} // namespace mca
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
index 4a83ba848dd88..c5459e42dfc9f 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
/// as load and store barriers.
void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+ /// Called within X86InstrPostProcess to remove the rsp write operand
+ /// on stack instructions to better simulate the stack engine. We currently
+ /// do not model features of the stack engine like sync uops.
+ void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
public:
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
new file mode 100644
index 0000000000000..2ffb52ae61fc4
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s
+
+movq $0x80, %rsp
+popq %rax
+popq %rcx
+popq %rdx
+popq %rbx
+popq %r12
+
+# CHECK: Iterations: 2
+# CHECK-NEXT: Instructions: 12
+# CHECK-NEXT: Total Cycles: 14
+# CHECK-NEXT: Total uOps: 22
+
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 1.57
+# CHECK-NEXT: IPC: 0.86
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 1 0.25 movq $128, %rsp
+# CHECK-NEXT: 2 6 0.50 * popq %rax
+# CHECK-NEXT: 2 6 0.50 * popq %rcx
+# CHECK-NEXT: 2 6 0.50 * popq %rdx
+# CHECK-NEXT: 2 6 0.50 * popq %rbx
+# CHECK-NEXT: 2 6 0.50 * popq %r12
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - SKLDivider
+# CHECK-NEXT: [1] - SKLFPDivider
+# CHECK-NEXT: [2] - SKLPort0
+# CHECK-NEXT: [3] - SKLPort1
+# CHECK-NEXT: [4] - SKLPort2
+# CHECK-NEXT: [5] - SKLPort3
+# CHECK-NEXT: [6] - SKLPort4
+# CHECK-NEXT: [7] - SKLPort5
+# CHECK-NEXT: [8] - SKLPort6
+# CHECK-NEXT: [9] - SKLPort7
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+# CHECK-NEXT: - - 1.50 1.50 2.50 2.50 - 1.50 1.50 -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+# CHECK-NEXT: - - - - - - - 0.50 0.50 - movq $128, %rsp
+# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rax
+# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rcx
+# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rdx
+# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rbx
+# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - popq %r12
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeER . . . movq $128, %rsp
+# CHECK-NEXT: [0,1] D=eeeeeeER. . popq %rax
+# CHECK-NEXT: [0,2] D=eeeeeeER. . popq %rcx
+# CHECK-NEXT: [0,3] .D=eeeeeeER . popq %rdx
+# CHECK-NEXT: [0,4] .D=eeeeeeER . popq %rbx
+# CHECK-NEXT: [0,5] .D==eeeeeeER . popq %r12
+# CHECK-NEXT: [1,0] . DeE------R . movq $128, %rsp
+# CHECK-NEXT: [1,1] . D=eeeeeeER . popq %rax
+# CHECK-NEXT: [1,2] . D==eeeeeeER. popq %rcx
+# CHECK-NEXT: [1,3] . D=eeeeeeER. popq %rdx
+# CHECK-NEXT: [1,4] . D==eeeeeeER popq %rbx
+# CHECK-NEXT: [1,5] . D==eeeeeeER popq %r12
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 1.0 1.0 3.0 movq $128, %rsp
+# CHECK-NEXT: 1. 2 2.0 0.0 0.0 popq %rax
+# CHECK-NEXT: 2. 2 2.5 0.5 0.0 popq %rcx
+# CHECK-NEXT: 3. 2 2.0 1.0 0.0 popq %rdx
+# CHECK-NEXT: 4. 2 2.5 1.5 0.0 popq %rbx
+# CHECK-NEXT: 5. 2 3.0 2.0 0.0 popq %r12
+# CHECK-NEXT: 2 2.2 1.0 0.5 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-push.s b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s
new file mode 100644
index 0000000000000..fc394d4c1e7d3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s
+
+movq $0x80, %rsp
+pushq %rax
+pushq %rcx
+pushq %rdx
+pushq %rbx
+pushq %r12
+
+# CHECK: Iterations: 2
+# CHECK-NEXT: Instructions: 12
+# CHECK-NEXT: Total Cycles: 15
+# CHECK-NEXT: Total uOps: 32
+
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 2.13
+# CHECK-NEXT: IPC: 0.80
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 1 0.25 movq $128, %rsp
+# CHECK-NEXT: 3 2 1.00 * pushq %rax
+# CHECK-NEXT: 3 2 1.00 * pushq %rcx
+# CHECK-NEXT: 3 2 1.00 * pushq %rdx
+# CHECK-NEXT: 3 2 1.00 * pushq %rbx
+# CHECK-NEXT: 3 2 1.00 * pushq %r12
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - SKLDivider
+# CHECK-NEXT: [1] - SKLFPDivider
+# CHECK-NEXT: [2] - SKLPort0
+# CHECK-NEXT: [3] - SKLPort1
+# CHECK-NEXT: [4] - SKLPort2
+# CHECK-NEXT: [5] - SKLPort3
+# CHECK-NEXT: [6] - SKLPort4
+# CHECK-NEXT: [7] - SKLPort5
+# CHECK-NEXT: [8] - SKLPort6
+# CHECK-NEXT: [9] - SKLPort7
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+# CHECK-NEXT: - - 1.50 1.50 1.50 1.50 5.00 1.50 1.50 2.00
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+# CHECK-NEXT: - - - - - - - - 1.00 - movq $128, %rsp
+# CHECK-NEXT: - - 0.50 - 0.50 - 1.00 0.50 - 0.50 pushq %rax
+# CHECK-NEXT: - - - 0.50 - 0.50 1.00 - 0.50 0.50 pushq %rcx
+# CHECK-NEXT: - - 0.50 - 0.50 0.50 1.00 0.50 - - pushq %rdx
+# CHECK-NEXT: - - - 0.50 0.50 - 1.00 0.50 - 0.50 pushq %rbx
+# CHECK-NEXT: - - 0.50 0.50 - 0.50 1.00 - - 0.50 pushq %r12
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeER . . . movq $128, %rsp
+# CHECK-NEXT: [0,1] D=eeER . . pushq %rax
+# CHECK-NEXT: [0,2] .D=eeER . . pushq %rcx
+# CHECK-NEXT: [0,3] .D==eeER . . pushq %rdx
+# CHECK-NEXT: [0,4] . D==eeER . . pushq %rbx
+# CHECK-NEXT: [0,5] . D===eeER. . pushq %r12
+# CHECK-NEXT: [1,0] . DeE---R. . movq $128, %rsp
+# CHECK-NEXT: [1,1] . D===eeER . pushq %rax
+# CHECK-NEXT: [1,2] . D===eeER . pushq %rcx
+# CHECK-NEXT: [1,3] . D====eeER . pushq %rdx
+# CHECK-NEXT: [1,4] . D====eeER. pushq %rbx
+# CHECK-NEXT: [1,5] . D=====eeER pushq %r12
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 1.0 1.0 1.5 movq $128, %rsp
+# CHECK-NEXT: 1. 2 3.0 0.5 0.0 pushq %rax
+# CHECK-NEXT: 2. 2 3.0 1.0 0.0 pushq %rcx
+# CHECK-NEXT: 3. 2 4.0 1.0 0.0 pushq %rdx
+# CHECK-NEXT: 4. 2 4.0 1.0 0.0 pushq %rbx
+# CHECK-NEXT: 5. 2 5.0 1.0 0.0 pushq %r12
+# CHECK-NEXT: 2 3.3 0.9 0.3 <total>
More information about the llvm-commits
mailing list