[llvm] [X86][APX] Prevent from emitting push2/pop2 if stack alignment<16B (PR #143076)
Feng Zou via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 6 00:02:50 PDT 2025
https://github.com/fzou1 updated https://github.com/llvm/llvm-project/pull/143076
>From 4c71a9845349753b8e8454452c04a4a9b5331424 Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Fri, 6 Jun 2025 12:48:50 +0800
Subject: [PATCH 1/2] [X86][APX] Prevent from emitting push2/pop2 when stack
alignment < 16 bytes
push2/pop2 require 16-byte stack alignment. If the stack alignment is
less than that, push2/pop2 must not be emitted.
---
llvm/lib/Target/X86/X86FrameLowering.cpp | 2 +-
...op2-disabled-with-small-stack-alignment.ll | 209 ++++++++++++++++++
2 files changed, 210 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 75f49beee27c6..09b036b5f0c77 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2921,7 +2921,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// 3. When the number of CSR push is even, start to use push2 from the 1st
// push and make the stack 16B aligned before the push
unsigned NumRegsForPush2 = 0;
- if (STI.hasPush2Pop2()) {
+ if (STI.hasPush2Pop2() && getStackAlignment() >= 16) {
unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
return X86::GR64RegClass.contains(I.getReg());
});
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
new file mode 100644
index 0000000000000..d72c0bbcbcc7d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK
+
+; This test checks that no push2/pop2 is emitted when the stack alignment is
+; less than the 16 bytes required by the push2/pop2 instructions. Here it is
+; overridden to 8 bytes via the "override-stack-alignment" module flag.
+
+define void @csr1() nounwind {
+; CHECK-LABEL: csr1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr2() nounwind {
+; CHECK-LABEL: csr2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr3() nounwind {
+; CHECK-LABEL: csr3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr4() nounwind {
+; CHECK-LABEL: csr4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr5() nounwind {
+; CHECK-LABEL: csr5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr6() nounwind {
+; CHECK-LABEL: csr6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind {
+; CHECK-LABEL: lea_in_epilog:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB6_5
+; CHECK-NEXT: # %bb.1: # %bb13
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $16, %rsp
+; CHECK-NEXT: movq %r9, %r14
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; CHECK-NEXT: addq %r14, %r13
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: addq %r14, %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: addq %r14, %rbx
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB6_2: # %bb15
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: incq %r12
+; CHECK-NEXT: movl $432, %edx # imm = 0x1B0
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: movq %r15, %rsi
+; CHECK-NEXT: callq memcpy@PLT
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: addq %rax, %r13
+; CHECK-NEXT: addq %rax, %r15
+; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq $8, %rbp
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB6_2
+; CHECK-NEXT: # %bb.3: # %bb11
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: jne .LBB6_5
+; CHECK-NEXT: # %bb.4: # %bb12
+; CHECK-NEXT: movq $0, (%rax)
+; CHECK-NEXT: .LBB6_5: # %bb14
+; CHECK-NEXT: retq
+bb:
+ br i1 %arg, label %bb13, label %bb14
+
+bb11:
+ br i1 %arg, label %bb14, label %bb12
+
+bb12:
+ store double 0.000000e+00, ptr %arg1, align 8
+ br label %bb14
+
+bb13:
+ %getelementptr = getelementptr i8, ptr null, i64 %arg5
+ br label %bb15
+
+bb14:
+ ret void
+
+bb15:
+ %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ]
+ %getelementptr16 = getelementptr double, ptr null, i64 %phi
+ %add = add i64 %phi, 1
+ %mul = mul i64 %arg6, %add
+ %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul
+ call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false)
+ %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7
+ %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false)
+ %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false)
+ %getelementptr21 = getelementptr i8, ptr %getelementptr17, i64 %arg10
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false)
+ br i1 %arg, label %bb11, label %bb15
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"override-stack-alignment", i32 8}
>From 0f8e848ccf3bd1712dc8bf43ab5cd482ca0baccc Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Fri, 6 Jun 2025 15:01:47 +0800
Subject: [PATCH 2/2] Update comments and remove the complicated sub test.
---
llvm/lib/Target/X86/X86FrameLowering.cpp | 1 +
llvm/lib/Target/X86/X86MachineFunctionInfo.h | 2 +-
...op2-disabled-with-small-stack-alignment.ll | 92 -------------------
3 files changed, 2 insertions(+), 93 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 09b036b5f0c77..42a09106b3a3a 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2915,6 +2915,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// 1. Use push2 when
// a) number of CSR > 1 if no need padding
// b) number of CSR > 2 if need padding
+ // c) stack alignment >= 16 bytes
// 2. When the number of CSR push is odd
// a. Start to use push2 from the 1st push if stack is 16B aligned.
// b. Start to use push2 from the 2nd push if stack is not 16B aligned.
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 24371369d4a45..5f974e5de9a19 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -149,7 +149,7 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// other tools to detect the extended record.
bool HasSwiftAsyncContext = false;
- /// Ajust stack for push2/pop2
+ /// Adjust stack for push2/pop2
bool PadForPush2Pop2 = false;
/// Candidate registers for push2/pop2
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
index d72c0bbcbcc7d..f07e40b442c04 100644
--- a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
@@ -113,97 +113,5 @@ entry:
ret void
}
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-
-define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind {
-; CHECK-LABEL: lea_in_epilog:
-; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: je .LBB6_5
-; CHECK-NEXT: # %bb.1: # %bb13
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: pushq %r15
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: pushq %r13
-; CHECK-NEXT: pushq %r12
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $16, %rsp
-; CHECK-NEXT: movq %r9, %r14
-; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r14
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; CHECK-NEXT: addq %r14, %r13
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; CHECK-NEXT: addq %r14, %r15
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; CHECK-NEXT: addq %r14, %rbx
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: xorl %r12d, %r12d
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: .LBB6_2: # %bb15
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: incq %r12
-; CHECK-NEXT: movl $432, %edx # imm = 0x1B0
-; CHECK-NEXT: xorl %edi, %edi
-; CHECK-NEXT: movq %r15, %rsi
-; CHECK-NEXT: callq memcpy@PLT
-; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: addq %rax, %r13
-; CHECK-NEXT: addq %rax, %r15
-; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: addq %rax, %r14
-; CHECK-NEXT: addq $8, %rbp
-; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: je .LBB6_2
-; CHECK-NEXT: # %bb.3: # %bb11
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %r12
-; CHECK-NEXT: popq %r13
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: jne .LBB6_5
-; CHECK-NEXT: # %bb.4: # %bb12
-; CHECK-NEXT: movq $0, (%rax)
-; CHECK-NEXT: .LBB6_5: # %bb14
-; CHECK-NEXT: retq
-bb:
- br i1 %arg, label %bb13, label %bb14
-
-bb11:
- br i1 %arg, label %bb14, label %bb12
-
-bb12:
- store double 0.000000e+00, ptr %arg1, align 8
- br label %bb14
-
-bb13:
- %getelementptr = getelementptr i8, ptr null, i64 %arg5
- br label %bb15
-
-bb14:
- ret void
-
-bb15:
- %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ]
- %getelementptr16 = getelementptr double, ptr null, i64 %phi
- %add = add i64 %phi, 1
- %mul = mul i64 %arg6, %add
- %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul
- call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false)
- %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7
- %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8
- call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false)
- %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9
- call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false)
- %getelementptr21 = getelementptr i8, ptr %getelementptr17, i64 %arg10
- call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false)
- br i1 %arg, label %bb11, label %bb15
-}
-
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"override-stack-alignment", i32 8}
More information about the llvm-commits
mailing list