[llvm] Add FeatureFuseLiterals as SubTargetFeature for Grace and Olympus (PR #160257)

Tue Sep 23 02:12:01 PDT 2025

https://github.com/sushgokh created https://github.com/llvm/llvm-project/pull/160257

With this, we are gaining significantly with povray benchmark from SPEC17(around 12% with -flto -Ofast). This is attributable to transformation from this feature and subsequent shrink wrapping.

We also see some improvement(around 2%) with xalanc benchmark from SPEC17.

There are some improvements on some internal benchmarks as well.

Note: All the performance changes are observed on Grace. Things should be same for Olympus as well

>From 0988d3607d86ca8baeb19d3edca1ef61ccb14d85 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 23 Sep 2025 01:52:29 -0700
Subject: [PATCH] Add FeatureFuseLiterals as SubTargetFeature for Grace and
 Olympus

With this, we are gaining significantly with povray benchmark from
SPEC17(around 12% with -flto -Ofast). This is attributable to
transformation from this feature and subsequent shrink wrapping.

We also see some improvement(around 2%) with xalanc benchmark from
SPEC17.

There are some improvements on some internal benchmarks as well.
---
 llvm/lib/Target/AArch64/AArch64Processors.td        | 6 ++++--
 llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll | 9 ++++++++-
 llvm/test/CodeGen/AArch64/selectopt-const.ll        | 3 ++-
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 81f5d075729d9..1d07e82acae77 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -328,7 +328,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureUseFixedOverScalableIfEqualCost]>;
+                                   FeatureUseFixedOverScalableIfEqualCost,
+                                   FeatureFuseLiterals]>;
 
 // Note that cyclone does not fuse AES instructions, but newer apple chips do
 // perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -641,7 +642,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeatureUseFixedOverScalableIfEqualCost,
                                       FeatureAvoidLDAPUR,
                                       FeaturePredictableSelectIsExpensive,
-                                      FeatureDisableLatencySchedHeuristic]>;
+                                      FeatureDisableLatencySchedHeuristic,
+                                      FeatureFuseLiterals]>;
 
 def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
                                       "Neoverse V3 ARM processors", [
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
index 70b6b91d3cf66..4b77e9eb71faf 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
@@ -12,7 +12,8 @@
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1     | FileCheck %s
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1     | FileCheck %s
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2     | FileCheck %s
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2     | FileCheck %s
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2     | FileCheck %s --check-prefix FUSE-LITERALS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus         | FileCheck %s --check-prefix FUSE-LITERALS
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=ampere1  -mattr=-fuse-literals | FileCheck %s
@@ -38,6 +39,12 @@ define double @litf() {
 ; CHECK-LABEL: litf:
 ; CHECK:      adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]]
 ; CHECK-NEXT: ldr  {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}}
+;
+; FUSE-LITERALS: mov     [[R:x[0-9]+]], #11544
+; FUSE-LITERALS: movk    [[R]], #21572, lsl #16
+; FUSE-LITERALS: movk    [[R]], #8699, lsl #32
+; FUSE-LITERALS: movk    [[R]], #16393, lsl #48
+; FUSE-LITERALS: fmov    {{d[0-9]+}}, [[R]]
 entry:
   ret double 0x400921FB54442D18
 }
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index fe48dbaf1ab76..62ac297153962 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=olympus -O3 < %s | FileCheck %s
 
 define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
 ; CHECK-LABEL: test_const:
@@ -8,10 +9,10 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
 ; CHECK-NEXT:    b.lt .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    mov w9, #1267 // =0x4f3
+; CHECK-NEXT:    movk w9, #16309, lsl #16
 ; CHECK-NEXT:    fmov s1, #1.00000000
 ; CHECK-NEXT:    fmov d2, #5.00000000
 ; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    movk w9, #16309, lsl #16
 ; CHECK-NEXT:    fmov s0, w9
 ; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    .p2align 5, , 16