[llvm] Add FeatureFuseLiterals as SubTargetFeature for Grace and Olympus (PR #160257)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 23 23:10:06 PDT 2025
https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/160257
>From 0988d3607d86ca8baeb19d3edca1ef61ccb14d85 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 23 Sep 2025 01:52:29 -0700
Subject: [PATCH 1/2] Add FeatureFuseLiterals as SubTargetFeature for Grace and
Olympus
With this, we see a significant gain on the povray benchmark from
SPEC17 (around 12% with -flto -Ofast). This is attributable to the
transformation enabled by this feature and the subsequent shrink wrapping.
We also see some improvement (around 2%) on the xalanc benchmark from
SPEC17.
There are improvements on some internal benchmarks as well.
---
llvm/lib/Target/AArch64/AArch64Processors.td | 6 ++++--
llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll | 9 ++++++++-
llvm/test/CodeGen/AArch64/selectopt-const.ll | 3 ++-
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 81f5d075729d9..1d07e82acae77 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -328,7 +328,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureUseFixedOverScalableIfEqualCost]>;
+ FeatureUseFixedOverScalableIfEqualCost,
+ FeatureFuseLiterals]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -641,7 +642,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
- FeatureDisableLatencySchedHeuristic]>;
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFuseLiterals]>;
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
index 70b6b91d3cf66..4b77e9eb71faf 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
@@ -12,7 +12,8 @@
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix FUSE-LITERALS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus | FileCheck %s --check-prefix FUSE-LITERALS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=ampere1 -mattr=-fuse-literals | FileCheck %s
@@ -38,6 +39,12 @@ define double @litf() {
; CHECK-LABEL: litf:
; CHECK: adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]]
; CHECK-NEXT: ldr {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}}
+;
+; FUSE-LITERALS: mov [[R:x[0-9]+]], #11544
+; FUSE-LITERALS: movk [[R]], #21572, lsl #16
+; FUSE-LITERALS: movk [[R]], #8699, lsl #32
+; FUSE-LITERALS: movk [[R]], #16393, lsl #48
+; FUSE-LITERALS: fmov {{d[0-9]+}}, [[R]]
entry:
ret double 0x400921FB54442D18
}
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index fe48dbaf1ab76..62ac297153962 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=olympus -O3 < %s | FileCheck %s
define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-LABEL: test_const:
@@ -8,10 +9,10 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: b.lt .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, #1267 // =0x4f3
+; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s1, #1.00000000
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: .p2align 5, , 16
>From f7781b6aad1af21084ba234b6975fd13b27439fd Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 23 Sep 2025 23:04:58 -0700
Subject: [PATCH 2/2] Address comments
Rather than adding the new feature, this changes how constants
are materialized for Grace and Olympus.
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++++++
llvm/lib/Target/AArch64/AArch64Processors.td | 6 ++----
.../test/CodeGen/AArch64/misched-fusion-addadrp.ll | 14 +++++++-------
llvm/test/CodeGen/AArch64/selectopt-const.ll | 3 +--
4 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cd7f0e719ad0c..d484fccb92d2f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12564,6 +12564,17 @@ bool AArch64TargetLowering::isOffsetFoldingLegal(
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool OptForSize) const {
+ // If the constant to be materialized is scalar, it may be efficient to use
+ // a sequence of 'mov + fmov' rather than 'adrp + ldr' on the specified CPUs.
+ // However, when materializing a vector of constants, there are two things to
+ // note:
+ // 1. Throughput of the fmov instruction is very low.
+ // 2. The ldr instruction can load multiple constants in one go. Also, its
+ // throughput is higher as compared to fmov.
+ if (!VT.isVector() && (Subtarget->getCPU() == "neoverse-v2" ||
+ Subtarget->getCPU() == "olympus"))
+ return true;
+
bool IsLegal = false;
// We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
// 16-bit case when target has full fp16 support.
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 1d07e82acae77..81f5d075729d9 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -328,8 +328,7 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureUseFixedOverScalableIfEqualCost,
- FeatureFuseLiterals]>;
+ FeatureUseFixedOverScalableIfEqualCost]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -642,8 +641,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
- FeatureDisableLatencySchedHeuristic,
- FeatureFuseLiterals]>;
+ FeatureDisableLatencySchedHeuristic]>;
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
index 4b77e9eb71faf..a30665cbbbc2a 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
@@ -12,8 +12,8 @@
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix FUSE-LITERALS
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus | FileCheck %s --check-prefix FUSE-LITERALS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix NO-CONST-POOL
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus | FileCheck %s --check-prefix NO-CONST-POOL
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=ampere1 -mattr=-fuse-literals | FileCheck %s
@@ -40,11 +40,11 @@ define double @litf() {
; CHECK: adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]]
; CHECK-NEXT: ldr {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}}
;
-; FUSE-LITERALS: mov [[R:x[0-9]+]], #11544
-; FUSE-LITERALS: movk [[R]], #21572, lsl #16
-; FUSE-LITERALS: movk [[R]], #8699, lsl #32
-; FUSE-LITERALS: movk [[R]], #16393, lsl #48
-; FUSE-LITERALS: fmov {{d[0-9]+}}, [[R]]
+; NO-CONST-POOL: mov [[R:x[0-9]+]], #11544
+; NO-CONST-POOL: movk [[R]], #21572, lsl #16
+; NO-CONST-POOL: movk [[R]], #8699, lsl #32
+; NO-CONST-POOL: movk [[R]], #16393, lsl #48
+; NO-CONST-POOL: fmov {{d[0-9]+}}, [[R]]
entry:
ret double 0x400921FB54442D18
}
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index 62ac297153962..fe48dbaf1ab76 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=olympus -O3 < %s | FileCheck %s
define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-LABEL: test_const:
@@ -9,10 +8,10 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: b.lt .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, #1267 // =0x4f3
-; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s1, #1.00000000
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: .p2align 5, , 16
More information about the llvm-commits
mailing list