[llvm] [AArch64] Reduce fuse-literals limit on Apple subtargets (PR #106741)
Marina Taylor via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 30 07:35:40 PDT 2024
https://github.com/citymarina created https://github.com/llvm/llvm-project/pull/106741
On Apple targets it is faster to load from a constant pool than to materialize
the value with a 4-move sequence plus an fmov (mov+movk+movk+movk+fmov).
Potentially it could also be good to do this for 3-move sequences, but we are
less confident about that case and leave it as a possible future refinement.
I am not familiar with the behaviour of other AArch64 processors so I am
making this subtarget-specific.
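For reference, a rough sketch of the two alternatives for a worst-case double (the constants mirror the double_4mov test added below; the register and the constant-pool label are placeholders):
```
; Build the value with integer moves, then transfer it to the FP register:
mov  x8, #4096          ; =0x1000
movk x8, #8192, lsl #16
movk x8, #12288, lsl #32
movk x8, #16384, lsl #48
fmov d0, x8

; Load it from a constant pool instead (preferred on Apple subtargets):
adrp x8, lCPI0_0@PAGE
ldr  d0, [x8, lCPI0_0@PAGEOFF]
```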
From 74667af109fb1f570f6884733ab079bd2206c920 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Fri, 30 Aug 2024 12:36:50 +0100
Subject: [PATCH 1/4] [AArch64] Fix a presumed typo in isFPImmLegal limit. NFC
The worst possible case for a double literal looks like this:
```
mov ...
movk ..., lsl #16
movk ..., lsl #32
movk ..., lsl #48
fmov ...
```
The limit of 5 in the code gives the impression that `Insn` includes
all instructions, including the `fmov`, but that is not the case: it
only counts the integer moves. This led me astray on some other work in
this area.
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 02390e0a85c0a5..98f6f30112a8c7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11463,7 +11463,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
// movw+movk is fused). So we limit up to 2 instrdduction at most.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
- unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
+ assert(Insn.size() <= 4);
+ unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
IsLegal = Insn.size() <= Limit;
}
From 99de373cf7f9a5caf918e57ba8399bd228686106 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Fri, 30 Aug 2024 13:02:12 +0100
Subject: [PATCH 2/4] [AArch64] Add tests for fused FP literals. NFC
This is for an upcoming change to the threshold on Apple targets
for using a constant pool for FP literals versus building them with
integer moves.
This file is based on literal_pools_float.ll. I tried to bolt onto
the existing test, but it got messy as that file is already testing
a matrix of combinations, so I am creating this new file instead.
---
.../AArch64/literal_pools_float_apple.ll | 128 ++++++++++++++++++
1 file changed, 128 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll
diff --git a/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll
new file mode 100644
index 00000000000000..144f71ab1e4695
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-macosx -mcpu=apple-m1 < %s | FileCheck %s --check-prefix=APPLE
+
+define dso_local float @float_0mov() {
+; CHECK-LABEL: float_0mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, #1.00000000
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: float_0mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: fmov s0, #1.00000000
+; APPLE-NEXT: ret
+ ret float 1.0
+}
+
+define dso_local float @float_1mov() {
+; CHECK-LABEL: float_1mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: float_1mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: mov w8, #2143289344 ; =0x7fc00000
+; APPLE-NEXT: fmov s0, w8
+; APPLE-NEXT: ret
+ ret float 0x7FF8000000000000
+}
+
+define dso_local float @float_2mov() {
+; CHECK-LABEL: float_2mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34952 // =0x8888
+; CHECK-NEXT: movk w8, #32704, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: float_2mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: mov w8, #34952 ; =0x8888
+; APPLE-NEXT: movk w8, #32704, lsl #16
+; APPLE-NEXT: fmov s0, w8
+; APPLE-NEXT: ret
+ ret float 0x7FF8111100000000
+}
+
+define dso_local double @double_0mov() {
+; CHECK-LABEL: double_0mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, #1.00000000
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: double_0mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: fmov d0, #1.00000000
+; APPLE-NEXT: ret
+ ret double 1.0
+}
+
+define dso_local double @double_1mov() {
+; CHECK-LABEL: double_1mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #4096 // =0x1000
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: double_1mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: mov x8, #4096 ; =0x1000
+; APPLE-NEXT: fmov d0, x8
+; APPLE-NEXT: ret
+ ret double 0x1000
+}
+
+define dso_local double @double_2mov() {
+; CHECK-LABEL: double_2mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #4096 // =0x1000
+; CHECK-NEXT: movk x8, #8192, lsl #16
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: double_2mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: mov x8, #4096 ; =0x1000
+; APPLE-NEXT: movk x8, #8192, lsl #16
+; APPLE-NEXT: fmov d0, x8
+; APPLE-NEXT: ret
+ ret double 0x20001000
+}
+
+define dso_local double @double_3mov() {
+; CHECK-LABEL: double_3mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI6_0
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: double_3mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: mov x8, #4096 ; =0x1000
+; APPLE-NEXT: movk x8, #8192, lsl #16
+; APPLE-NEXT: movk x8, #12288, lsl #32
+; APPLE-NEXT: fmov d0, x8
+; APPLE-NEXT: ret
+ ret double 0x300020001000
+}
+
+define dso_local double @double_4mov() {
+; CHECK-LABEL: double_4mov:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI7_0
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT: ret
+;
+; APPLE-LABEL: double_4mov:
+; APPLE: ; %bb.0:
+; APPLE-NEXT: mov x8, #4096 ; =0x1000
+; APPLE-NEXT: movk x8, #8192, lsl #16
+; APPLE-NEXT: movk x8, #12288, lsl #32
+; APPLE-NEXT: movk x8, #16384, lsl #48
+; APPLE-NEXT: fmov d0, x8
+; APPLE-NEXT: ret
+ ret double 0x4000300020001000
+}
From a39c48780f8b6943bb46d8dd366a07a1937ace46 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Fri, 30 Aug 2024 15:02:56 +0100
Subject: [PATCH 3/4] Add message to assert
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 98f6f30112a8c7..17772e0502f652 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11463,7 +11463,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
// movw+movk is fused). So we limit up to 2 instrdduction at most.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
- assert(Insn.size() <= 4);
+ assert(Insn.size() <= 4 &&
+ "Should be able to build any value with at most 4 moves");
unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
IsLegal = Insn.size() <= Limit;
}
From 6b07b9d59eaece4a301d307cf0fb5b224b9b5cf2 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Fri, 30 Aug 2024 15:25:13 +0100
Subject: [PATCH 4/4] [AArch64] Reduce fuse-literals limit on Apple subtargets
On Apple targets it is faster to load from a constant pool than to materialize
the value with a 4-move sequence plus an fmov (mov+movk+movk+movk+fmov).
Potentially it could also be good to do this for 3-move sequences, but we are
less confident about that case and leave it as a possible future refinement.
I am not familiar with the behaviour of other AArch64 processors so I am
making this subtarget-specific.
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++----
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1 +
llvm/lib/Target/AArch64/AArch64Subtarget.h | 3 +++
.../test/CodeGen/AArch64/literal_pools_float_apple.ll | 10 +++++-----
4 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 17772e0502f652..bf3ca8d003cf91 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11458,14 +11458,17 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
// The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
// however the mov+fmov sequence is always better because of the reduced
- // cache pressure. The timings are still the same if you consider
- // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
- // movw+movk is fused). So we limit up to 2 instrdduction at most.
+ // cache pressure. Where targets allow, longer sequences may be possible.
+ // For example, movw+movk+fmov may be comparable to adrp+ldr if the
+ // movw+movk is fused.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
assert(Insn.size() <= 4 &&
"Should be able to build any value with at most 4 moves");
- unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
+ unsigned Limit = (OptForSize ? 1
+ : (Subtarget->hasFuseLiterals()
+ ? Subtarget->getFuseLiteralsLimit()
+ : 2));
IsLegal = Insn.size() <= Limit;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 32db1e8c2477a8..ad4d5c23a23678 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -188,6 +188,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefetchDistance = 280;
MinPrefetchStride = 2048;
MaxPrefetchIterationsAhead = 3;
+ FuseLiteralsLimit = 3;
switch (ARMProcFamily) {
case AppleA14:
case AppleA15:
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index accfb49c6fbe3a..23d7ca263eb399 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -70,6 +70,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
unsigned MaxBytesForLoopAlignment = 0;
unsigned MinimumJumpTableEntries = 4;
unsigned MaxJumpTableSize = 0;
+ unsigned FuseLiteralsLimit = 4;
// ReserveXRegister[i] - X#i is not available as a general purpose register.
BitVector ReserveXRegister;
@@ -254,6 +255,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
return MinimumJumpTableEntries;
}
+ unsigned getFuseLiteralsLimit() const { return FuseLiteralsLimit; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
diff --git a/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll
index 144f71ab1e4695..8b755d9e35c074 100644
--- a/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll
+++ b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll
@@ -118,11 +118,11 @@ define dso_local double @double_4mov() {
;
; APPLE-LABEL: double_4mov:
; APPLE: ; %bb.0:
-; APPLE-NEXT: mov x8, #4096 ; =0x1000
-; APPLE-NEXT: movk x8, #8192, lsl #16
-; APPLE-NEXT: movk x8, #12288, lsl #32
-; APPLE-NEXT: movk x8, #16384, lsl #48
-; APPLE-NEXT: fmov d0, x8
+; APPLE-NEXT: Lloh0:
+; APPLE-NEXT: adrp x8, lCPI7_0@PAGE
+; APPLE-NEXT: Lloh1:
+; APPLE-NEXT: ldr d0, [x8, lCPI7_0@PAGEOFF]
; APPLE-NEXT: ret
+; APPLE-NEXT: .loh AdrpLdr Lloh0, Lloh1
ret double 0x4000300020001000
}