[llvm] Add FeatureFuseLiterals as SubTargetFeature for Grace and Olympus (PR #160257)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 23 02:12:01 PDT 2025
https://github.com/sushgokh created https://github.com/llvm/llvm-project/pull/160257
With this change, we see a significant gain on the povray benchmark from SPEC17 (around 12% with -flto -Ofast). This is attributable to the transformation enabled by this feature and the subsequent shrink wrapping.
We also see some improvement (around 2%) on the xalanc benchmark from SPEC17.
There are improvements on some internal benchmarks as well.
Note: All the performance changes were observed on Grace. Things should be the same for Olympus as well.
From 0988d3607d86ca8baeb19d3edca1ef61ccb14d85 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 23 Sep 2025 01:52:29 -0700
Subject: [PATCH] Add FeatureFuseLiterals as SubTargetFeature for Grace and
Olympus
With this, we are gaining significantly with povray benchmark from
SPEC17(around 12% with -flto -Ofast). This is attributable to
transformation from this feature and subsequent shrink wrapping.
We also see some improvement(around 2%) with xalanc benchmark from
SPEC17.
There are some improvements on some internal benchmarks as well.
---
llvm/lib/Target/AArch64/AArch64Processors.td | 6 ++++--
llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll | 9 ++++++++-
llvm/test/CodeGen/AArch64/selectopt-const.ll | 3 ++-
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 81f5d075729d9..1d07e82acae77 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -328,7 +328,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureUseFixedOverScalableIfEqualCost]>;
+ FeatureUseFixedOverScalableIfEqualCost,
+ FeatureFuseLiterals]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -641,7 +642,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
- FeatureDisableLatencySchedHeuristic]>;
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFuseLiterals]>;
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
index 70b6b91d3cf66..4b77e9eb71faf 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
@@ -12,7 +12,8 @@
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix FUSE-LITERALS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus | FileCheck %s --check-prefix FUSE-LITERALS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=ampere1 -mattr=-fuse-literals | FileCheck %s
@@ -38,6 +39,12 @@ define double @litf() {
; CHECK-LABEL: litf:
; CHECK: adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]]
; CHECK-NEXT: ldr {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}}
+;
+; FUSE-LITERALS: mov [[R:x[0-9]+]], #11544
+; FUSE-LITERALS: movk [[R]], #21572, lsl #16
+; FUSE-LITERALS: movk [[R]], #8699, lsl #32
+; FUSE-LITERALS: movk [[R]], #16393, lsl #48
+; FUSE-LITERALS: fmov {{d[0-9]+}}, [[R]]
entry:
ret double 0x400921FB54442D18
}
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index fe48dbaf1ab76..62ac297153962 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=olympus -O3 < %s | FileCheck %s
define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-LABEL: test_const:
@@ -8,10 +9,10 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: b.lt .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, #1267 // =0x4f3
+; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s1, #1.00000000
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: .p2align 5, , 16
More information about the llvm-commits
mailing list