[llvm] f83ab2b - [ARM] Improve generation of thumb stack accesses
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 7 09:54:49 PDT 2023
Author: John Brawn
Date: 2023-08-07T17:53:32+01:00
New Revision: f83ab2b3beb0303e0775ea187d9575faa4a048eb
URL: https://github.com/llvm/llvm-project/commit/f83ab2b3beb0303e0775ea187d9575faa4a048eb
DIFF: https://github.com/llvm/llvm-project/commit/f83ab2b3beb0303e0775ea187d9575faa4a048eb.diff
LOG: [ARM] Improve generation of thumb stack accesses
Currently when a stack access is out of range of an sp-relative ldr or
str then we jump straight to generating the offset with a literal pool
load or mov32 pseudo-instruction. This patch improves that in two
ways:
* If the offset is within range of an sp-relative add plus an ldr then
use that.
* When we use the mov32 pseudo-instruction, if putting part of the
offset into the ldr will simplify the expansion of the mov32 then
do so.
Differential Revision: https://reviews.llvm.org/D156875
Added:
llvm/test/CodeGen/ARM/execute-only-split-offset.ll
Modified:
llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
llvm/test/CodeGen/ARM/large-stack.ll
llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
llvm/test/CodeGen/Thumb/stack-access.ll
llvm/test/CodeGen/Thumb/stack_guard_remat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index 0c010ed1eb3423..d8eb6c36c2745a 100644
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -128,6 +128,19 @@ static void emitThumbRegPlusImmInReg(
const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
+
+ // Use a single sp-relative add if the immediate is small enough.
+ if (BaseReg == ARM::SP &&
+ (DestReg.isVirtual() || isARMLowRegister(DestReg)) && NumBytes >= 0 &&
+ NumBytes <= 1020 && (NumBytes % 4) == 0) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), DestReg)
+ .addReg(ARM::SP)
+ .addImm(NumBytes / 4)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
+ return;
+ }
+
bool isHigh = !isARMLowRegister(DestReg) ||
(BaseReg != 0 && !isARMLowRegister(BaseReg));
bool isSub = false;
@@ -422,19 +435,33 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
return true;
}
+ // The offset doesn't fit, but we may be able to put some of the offset into
+ // the ldr to simplify the generation of the rest of it.
NumBits = 5;
Mask = (1 << NumBits) - 1;
-
- // If this is a thumb spill / restore, we will be using a constpool load to
- // materialize the offset.
- if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
- ImmOp.ChangeToImmediate(0);
- } else {
- // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
- ImmedOffset = ImmedOffset & Mask;
- ImmOp.ChangeToImmediate(ImmedOffset);
- Offset &= ~(Mask * Scale);
+ InstrOffs = 0;
+ auto &ST = MF.getSubtarget<ARMSubtarget>();
+ // If using the maximum ldr offset will put the rest into the range of a
+ // single sp-relative add then do so.
+ if (FrameReg == ARM::SP && Offset - (Mask * Scale) <= 1020) {
+ InstrOffs = Mask;
+ } else if (ST.genExecuteOnly()) {
+ // With execute-only the offset is generated either with movw+movt or an
+ // add+lsl sequence. If subtracting an offset will make the top half zero
+ // then that saves a movt or lsl+add. Otherwise if we don't have movw then
+ // we may be able to subtract a value such that it makes the bottom byte
+ // zero, saving an add.
+ unsigned BottomBits = (Offset / Scale) & Mask;
+ bool CanMakeBottomByteZero = ((Offset - BottomBits * Scale) & 0xff) == 0;
+ bool TopHalfZero = (Offset & 0xffff0000) == 0;
+ bool CanMakeTopHalfZero = ((Offset - Mask * Scale) & 0xffff0000) == 0;
+ if (!TopHalfZero && CanMakeTopHalfZero)
+ InstrOffs = Mask;
+ else if (!ST.useMovt() && CanMakeBottomByteZero)
+ InstrOffs = BottomBits;
}
+ ImmOp.ChangeToImmediate(InstrOffs);
+ Offset -= InstrOffs * Scale;
}
return Offset == 0;
diff --git a/llvm/test/CodeGen/ARM/execute-only-split-offset.ll b/llvm/test/CodeGen/ARM/execute-only-split-offset.ll
new file mode 100644
index 00000000000000..c4ee3189dd7723
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/execute-only-split-offset.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=thumbv8m.base-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefixes=CHECK,CHECK-MOVW %s
+; RUN: llc -mtriple=thumbv6m-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefixes=CHECK,CHECK-NOMOVW %s
+
+; Largest offset that fits into sp-relative ldr
+; CHECK-LABEL: ldr_range_end:
+; CHECK: ldr {{r[0-9]+}}, [sp, #1020]
+define i32 @ldr_range_end() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [1020 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Smallest offset that fits into add+ldr
+; CHECK-LABEL: add_ldr_range_start:
+; CHECK: add [[REG:r[0-9]+]], sp, #900
+; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
+define i32 @add_ldr_range_start() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [1024 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Largest offset that fits into add+ldr
+; CHECK-LABEL: add_ldr_range_end:
+; CHECK: add [[REG:r[0-9]+]], sp, #1020
+; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
+define i32 @add_ldr_range_end() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [1144 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Smallest offset where we start using mov32. If we don't have movw then using
+; an ldr offset means we save an add.
+; CHECK-LABEL: mov32_range_start:
+; CHECK-MOVW: movw [[REG:r[0-9]+]], #1148
+; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #4
+; CHECK-NOMOVW-NEXT: lsls [[REG]], [[REG]], #8
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-MOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
+; CHECK-NOMOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
+define i32 @mov32_range_start() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [1148 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Here using an ldr offset doesn't save an add so we shouldn't do it.
+; CHECK-LABEL: mov32_range_next:
+; CHECK-MOVW: movw [[REG:r[0-9]+]], #1152
+; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #4
+; CHECK-NOMOVW-NEXT: lsls [[REG]], [[REG]], #8
+; CHECK-NOMOVW-NEXT: adds [[REG]], #128
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
+define i32 @mov32_range_next() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [1152 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Smallest offset where using an ldr offset prevents needing a movt or lsl+add
+; CHECK-LABEL: can_clear_top_byte_start:
+; CHECK: add sp, {{r[0-9]+}}
+; CHECK-MOVW: movw [[REG:r[0-9]+]], #65412
+; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #255
+; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #8
+; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #132
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
+define i32 @can_clear_top_byte_start() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [65536 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Largest offset where using an ldr offset prevents needing a movt or lsl+add
+; CHECK-LABEL: can_clear_top_byte_end:
+; CHECK: add sp, {{r[0-9]+}}
+; CHECK-MOVW: movw [[REG:r[0-9]+]], #65532
+; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #255
+; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #8
+; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #252
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
+define i32 @can_clear_top_byte_end() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [65656 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; Smallest offset where using an ldr offset doesn't clear the top byte, though
+; we can use an ldr offset if not using movt to save an add of the low byte.
+; CHECK-LABEL: cant_clear_top_byte_start:
+; CHECK: add sp, {{r[0-9]+}}
+; CHECK-MOVW: movw [[REG:r[0-9]+]], #124
+; CHECK-MOVW-NEXT: movt [[REG:r[0-9]+]], #1
+; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #1
+; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #16
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-MOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
+; CHECK-NOMOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
+define i32 @cant_clear_top_byte_start() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [65660 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
+
+; An ldr offset doesn't help for anything, so we shouldn't do it.
+; CHECK-LABEL: cant_clear_top_byte_next:
+; CHECK: add sp, {{r[0-9]+}}
+; CHECK-MOVW: movw [[REG:r[0-9]+]], #128
+; CHECK-MOVW: movt [[REG:r[0-9]+]], #1
+; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #1
+; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #16
+; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #128
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
+define i32 @cant_clear_top_byte_next() {
+entry:
+ %var = alloca i32, align 4
+ %arr = alloca [65664 x i8], align 4
+ %0 = load i32, ptr %var, align 4
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/ARM/large-stack.ll b/llvm/test/CodeGen/ARM/large-stack.ll
index a1adf5d21dc314..e33f0646c5b40a 100644
--- a/llvm/test/CodeGen/ARM/large-stack.ll
+++ b/llvm/test/CodeGen/ARM/large-stack.ll
@@ -44,7 +44,8 @@ define i32 @test3() {
;; are we choosing correct store/tSTRspi pattern for execute-only
; CHECK: movs [[REG:r[0-9]+]], #0x30
; CHECK-NEXT: lsls [[REG]], [[REG]], #0x18
-; CHECK-NEXT: adds [[REG]], #0x8
+; CHECK-NEXT: add [[REG]], sp
+; CHECK-NEXT: str {{r[0-9]+}}, [[[REG]], #0x8]
%tmp1 = load i32, ptr %tmp
ret i32 %tmp1
}
diff --git a/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
index 17590347ed1c2c..b20e80b8104548 100644
--- a/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
+++ b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
@@ -176,18 +176,13 @@ define void @arg_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, ptr byval([2
; CHECK-NEXT: @APP
; CHECK-NEXT: @NO_APP
; CHECK-NEXT: str r0, [sp]
-; CHECK-NEXT: ldr r0, .LCPI3_0
-; CHECK-NEXT: add r0, sp
-; CHECK-NEXT: str r5, [r0]
+; CHECK-NEXT: add r0, sp, #904
+; CHECK-NEXT: str r5, [r0, #124]
; CHECK-NEXT: ldr r0, [sp]
; CHECK-NEXT: @APP
; CHECK-NEXT: @NO_APP
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .long 1028 @ 0x404
entry:
%asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(ptr %p, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
%asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
diff --git a/llvm/test/CodeGen/Thumb/stack-access.ll b/llvm/test/CodeGen/Thumb/stack-access.ll
index b9b0e8f0fdf6ff..fd25d49e18fab6 100644
--- a/llvm/test/CodeGen/Thumb/stack-access.ll
+++ b/llvm/test/CodeGen/Thumb/stack-access.ll
@@ -110,19 +110,18 @@ define void @test8() {
store i32 1, ptr %arr2, align 4
; %arr2 is in range, but this element of it is not
-; CHECK-DAG: ldr [[RA:r[0-9]+]], .LCPI7_2
-; CHECK-DAG: add [[RA]], sp
-; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
+; CHECK-DAG: add [[RA:r[0-9]+]], sp, #900
+; CHECK-DAG: str [[REG]], [{{r[0-9]+}}, #124]
%arr2idx2 = getelementptr inbounds [224 x i32], ptr %arr2, i32 0, i32 32
store i32 1, ptr %arr2idx2, align 4
; %arr3 is not in range
-; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_3
+; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_2
; CHECK-DAG: add [[RB]], sp
; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
store i32 1, ptr %arr3, align 4
-; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_4
+; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_3
; CHECK-DAG: add [[RC]], sp
; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
%arr3idx2 = getelementptr inbounds [224 x i32], ptr %arr3, i32 0, i32 32
diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
index cfb3e1b2782245..748cc8744d6694 100644
--- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
@@ -3,9 +3,8 @@
; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC
;PIC: foo2
-;PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]]
-;PIC-NEXT: add [[SAVED_GUARD]], sp
-;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]]
+;PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904
+;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124]
;PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]]
;PIC-NEXT: [[LABEL1:LPC[0-9_]+]]:
;PIC-NEXT: add [[ORIGINAL_GUARD]], pc
@@ -13,28 +12,21 @@
;PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
;PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]]
-;PIC: [[GUARD_STACK_OFFSET]]:
-;PIC-NEXT: .long 1028
;PIC: [[ORIGINAL_GUARD_LABEL]]:
;PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+4)
;NO-PIC: foo2
-;NO-PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]]
-;NO-PIC-NEXT: add [[SAVED_GUARD]], sp
-;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]]
+;NO-PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904
+;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124]
;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]]
;NO-PIC-NOT: LPC
;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
;DYNAMIC-NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
;NO-PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]]
-;STATIC: [[GUARD_STACK_OFFSET]]:
-;STATIC-NEXT: .long 1028
;STATIC: [[ORIGINAL_GUARD_LABEL]]:
;STATIC-NEXT: .long ___stack_chk_guard
-;DYNAMIC-NO-PIC: [[GUARD_STACK_OFFSET]]:
-;DYNAMIC-NO-PIC-NEXT: .long 1028
;DYNAMIC-NO-PIC: [[ORIGINAL_GUARD_LABEL]]:
;DYNAMIC-NO-PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr
More information about the llvm-commits
mailing list