[llvm] 83cdb65 - [AArch64][Fix] LdSt optimization generates premature stack-popping
Diogo Sampaio via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 19:03:17 PDT 2020
Author: Diogo Sampaio
Date: 2020-03-14T02:03:10Z
New Revision: 83cdb654e4759ad00aba27f25615c4a0c7c2e62e
URL: https://github.com/llvm/llvm-project/commit/83cdb654e4759ad00aba27f25615c4a0c7c2e62e
DIFF: https://github.com/llvm/llvm-project/commit/83cdb654e4759ad00aba27f25615c4a0c7c2e62e.diff
LOG: [AArch64][Fix] LdSt optimization generates premature stack-popping
Summary:
When merging add and sub instructions into the addressing
mode of memory instructions, aarch64-ldst-opt would
prematurely pop the stack pointer before later memory
instructions that still access the stack through indirect
loads.
e.g.
```
int foo(int offset) {
  int local[4] = {0};
  return local[offset];
}
```
would generate:
```
sub sp, sp, #16           ; Allocate the stack frame
mov x8, sp                ; Save the stack pointer in a register
stp xzr, xzr, [sp], #16   ; Zero-initialize the frame, then post-increment SP, deallocating it too early
------ If an exception occurs here, the frame contents may be corrupted
ldr w0, [x8, w0, sxtw #2] ; Accesses the correct address, but the memory is no longer protected by SP
```
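With the fix, the pass refuses to fold the post-increment into the store when any intervening instruction may load or store, so SP keeps covering the frame until after the last access. A sketch of the corrected sequence, inferred from the updated test expectations below (not verbatim compiler output):
```
sub sp, sp, #16           ; Allocate the stack frame
mov x8, sp                ; Save the stack pointer in a register
stp xzr, xzr, [sp]        ; Zero-initialize; SP is left unchanged
ldr w0, [x8, w0, sxtw #2] ; Load while the frame is still protected by SP
add sp, sp, #16           ; Pop the stack only after the last access
```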
Reviewers: fhahn, foad, thegameg, eli.friedman, efriedma
Reviewed By: efriedma
Subscribers: efriedma, kristof.beyls, hiraditya, danielkiss, llvm-commits, simon_tatham
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75755
Added:
llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
Modified:
llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
llvm/test/CodeGen/AArch64/arm64-nvcast.ll
llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index c403860622da..acd71bce0152 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -1780,6 +1781,21 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
ModifiedRegUnits.clear();
UsedRegUnits.clear();
++MBBI;
+
+ // We can't post-increment the stack pointer if any instruction between
+ // the memory access (I) and the increment (MBBI) can access the memory
+ // region defined by [SP, MBBI].
+ const bool BaseRegSP = BaseReg == AArch64::SP;
+ if (BaseRegSP) {
+ // FIXME: For now, we always block the optimization over SP in windows
+ // targets as it requires to adjust the unwind/debug info, messing up
+ // the unwind info can actually cause a miscompile.
+ const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
+ if (MAI->usesWindowsCFI() &&
+ I->getMF()->getFunction().needsUnwindTableEntry())
+ return E;
+ }
+
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
MachineInstr &MI = *MBBI;
@@ -1797,8 +1813,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
+ // If we are optimizing SP, do not allow instructions that may load or store
+ // in between the load and the optimized value update.
if (!ModifiedRegUnits.available(BaseReg) ||
- !UsedRegUnits.available(BaseReg))
+ !UsedRegUnits.available(BaseReg) ||
+ (BaseRegSP && MBBI->mayLoadOrStore()))
return E;
}
return E;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
new file mode 100644
index 000000000000..44b39ae8e626
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
@@ -0,0 +1,85 @@
+# RUN: llc -start-before=aarch64-ldst-opt -o - %s | FileCheck %s
+# CHECK-NOT: stp xzr, xzr, [sp], #16
+# CHECK: add sp, sp, #16
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64-arm-none-eabi"
+
+ define hidden i32 @foo(i32 %0) {
+ %2 = alloca [4 x i32], align 4
+ %3 = bitcast [4 x i32]* %2 to i8*
+ call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %3, i8 0, i64 16, i1 false)
+ %4 = sext i32 %0 to i64
+ %5 = getelementptr inbounds [4 x i32], [4 x i32]* %2, i64 0, i64 %4
+ %6 = load i32, i32* %5, align 4
+ ret i32 %6
+ }
+
+ declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2
+ declare void @llvm.stackprotector(i8*, i8**) #3
+
+ !llvm.module.flags = !{!0}
+ !llvm.ident = !{!1}
+
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{!"clang version 11.0.0 "}
+ !2 = !{!3, !3, i64 0}
+ !3 = !{!"int", !4, i64 0}
+ !4 = !{!"omnipotent char", !5, i64 0}
+ !5 = !{!"Simple C++ TBAA"}
+
+...
+---
+name: foo
+alignment: 8
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 16
+ offsetAdjustment: 0
+ maxAlignment: 8
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 16
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, type: default, offset: -16, size: 16,
+ alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0 (%ir-block.1):
+ liveins: $w0
+
+ $sp = frame-setup SUBXri $sp, 16, 0
+ $x8 = ADDXri $sp, 0, 0
+ STRXui $xzr, $sp, 1 :: (store 8 into %ir.3 + 8)
+ STRXui $xzr, $sp, 0 :: (store 8 into %ir.3)
+ renamable $w0 = LDRWroW killed renamable $x8, killed renamable $w0, 1, 1 :: (load 4 from %ir.5, !tbaa !2)
+ $sp = frame-destroy ADDXri $sp, 16, 0
+ RET undef $lr, implicit $w0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
index 59b956c7d90c..ebaef0cbd4c2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -1,31 +1,41 @@
; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
-; CHECK-LABEL: _test:
-; CHECK-DAG: fmov.2d v0, #2.00000000
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
define void @test(float * %p1, i32 %v1) {
+; CHECK-LABEL: test:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: sub sp, sp, #16 ; =16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: fmov.2d v0, #2.00000000
+; CHECK-NEXT: and x8, x1, #0x3
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x9, x8, #2, #2
+; CHECK-NEXT: ldr s0, [x9]
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: add sp, sp, #16 ; =16
+; CHECK-NEXT: ret
entry:
%v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
store float %v2, float* %p1, align 4
ret void
}
-; CHECK-LABEL: _test2
-; CHECK: movi.16b v0, #63
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
define void @test2(float * %p1, i32 %v1) {
+; CHECK-LABEL: test2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: sub sp, sp, #16 ; =16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: movi.16b v0, #63
+; CHECK-NEXT: and x8, x1, #0x3
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x9, x8, #2, #2
+; CHECK-NEXT: ldr s0, [x9]
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: add sp, sp, #16 ; =16
+; CHECK-NEXT: ret
entry:
%v2 = extractelement <3 x float> <float 0.7470588088035583, float 0.7470588088035583, float 0.7470588088035583>, i32 %v1
store float %v2, float* %p1, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
index 3902bbc9fca5..2a0793aa851d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -24,10 +24,12 @@ entry:
%struct.S2 = type { i32, i32, i32, i32 }
define dso_local [2 x i64] @"?f2"() {
entry:
+; FIXME: Missed optimization, the entire SP push/pop could be removed
; CHECK-LABEL: f2
-; CHECK: stp xzr, xzr, [sp], #16
-; CHECK: mov x0, xzr
-; CHECK: mov x1, xzr
+; CHECK: stp xzr, xzr, [sp, #-16]!
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: add sp, sp, #16
%retval = alloca %struct.S2, align 4
%a = getelementptr inbounds %struct.S2, %struct.S2* %retval, i32 0, i32 0