[llvm] [AArch64] Turn MaxInterleaveFactor into a subtarget feature (PR #171088)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 00:13:15 PST 2025
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/171088
The default value for MaxInterleaveFactor is 2, but some CPUs prefer a wider factor of 4. This adds a subtarget feature so that CPUs can override the default in their tuning features, keeping more of the tuning options together in one place.
>From e67e073a8fb3f28699f7d8c7f092140f662154cf Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 5 Dec 2025 11:43:54 +0000
Subject: [PATCH] [AArch64] Turn MaxInterleaveFactor into a subtarget feature
The default value for MaxInterleaveFactor is 2, but some CPUs prefer a wider
factor of 4. This adds a subtarget feature so that CPUs can override the
default in their tuning features, keeping more of the options together in one
place.
---
llvm/lib/Target/AArch64/AArch64Features.td | 4 ++
llvm/lib/Target/AArch64/AArch64Processors.td | 63 +++++++++++++-------
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 24 --------
3 files changed, 46 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 066724bea92c9..b6275329c8672 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -915,6 +915,10 @@ def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
"UseWzrToVecMove", "true",
"Move from WZR to insert 0 into vector registers">;
+def FeatureMaxInterleaveFactor4 : SubtargetFeature<
+ "max-interleave-factor-4", "MaxInterleaveFactor", "4",
+ "Set the MaxInterleaveFactor to 4 (from the default 2)">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 120415f91c9ae..31990c6d4e222 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -74,7 +74,8 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
"Cortex-A65 ARM processors", [
@@ -306,7 +307,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
FeatureStorePairSuppress,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
"Fujitsu FUJITSU-MONAKA processors", [
@@ -328,7 +330,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureUseFixedOverScalableIfEqualCost]>;
+ FeatureUseFixedOverScalableIfEqualCost,
+ FeatureMaxInterleaveFactor4]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -431,7 +434,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -452,7 +456,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -473,7 +478,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -494,7 +500,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -514,7 +521,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
@@ -528,7 +536,8 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureStorePairSuppress,
FeatureALULSLFast,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
// Re-uses some scheduling and tunings from the ExynosM3 proc family.
def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
@@ -556,7 +565,8 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureALULSLFast,
- FeatureStorePairSuppress]>;
+ FeatureStorePairSuppress,
+ FeatureMaxInterleaveFactor4]>;
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
@@ -566,7 +576,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast,
- FeatureSlowSTRQro]>;
+ FeatureSlowSTRQro,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
"Neoverse E1 ARM processors", [
@@ -614,7 +625,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
@@ -643,7 +655,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
- FeatureDisableLatencySchedHeuristic]>;
+ FeatureDisableLatencySchedHeuristic,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
@@ -655,7 +668,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
"Neoverse V3AE ARM processors", [
@@ -676,7 +690,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
- FeatureALULSLFast]>;
+ FeatureALULSLFast,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
"Cavium ThunderX2 processors", [
@@ -684,7 +699,8 @@ def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "Thund
FeatureArithmeticBccFusion,
FeaturePostRAScheduler,
FeatureStorePairSuppress,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
"ThunderX3T110",
@@ -695,7 +711,8 @@ def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
FeaturePredictableSelectIsExpensive,
FeatureBalanceFPOps,
FeatureStorePairSuppress,
- FeatureStrictAlign]>;
+ FeatureStrictAlign,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
@@ -743,7 +760,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
"Ampere Computing Ampere-1A processors", [
@@ -759,7 +777,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeatureFuseAddSub2RegAndConstOne,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
"Ampere Computing Ampere-1B processors", [
@@ -776,7 +795,8 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
"Oryon",
@@ -799,7 +819,8 @@ def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
FeaturePerfMon,
FeatureSPE,
FeaturePostRAScheduler,
- HasV8_6aOps]>;
+ HasV8_6aOps,
+ FeatureMaxInterleaveFactor4]>;
def ProcessorFeatures {
list<SubtargetFeature> A320 = [HasV9_2aOps, FeatureNEON, FeatureMTE,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 53b00e83a36b3..df5dab31df9f9 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -149,7 +149,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 8;
break;
case CortexA57:
- MaxInterleaveFactor = 4;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(16);
MaxBytesForLoopAlignment = 8;
@@ -199,7 +198,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 256;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -222,26 +220,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefetchDistance = 280;
MinPrefetchStride = 2048;
MaxPrefetchIterationsAhead = 3;
- switch (ARMProcFamily) {
- case AppleA14:
- case AppleA15:
- case AppleA16:
- case AppleA17:
- case AppleM4:
- MaxInterleaveFactor = 4;
- break;
- default:
- break;
- }
break;
case ExynosM3:
- MaxInterleaveFactor = 4;
MaxJumpTableSize = 20;
PrefFunctionAlignment = Align(32);
PrefLoopAlignment = Align(16);
break;
case Falkor:
- MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
CacheLineSize = 128;
@@ -250,7 +235,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
- MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
CacheLineSize = 128;
PrefetchDistance = 740;
@@ -271,7 +255,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case NeoverseV3:
CacheLineSize = 64;
EpilogueVectorizationMinVF = 8;
- MaxInterleaveFactor = 4;
ScatterOverhead = 13;
[[fallthrough]];
case NeoverseN2:
@@ -291,10 +274,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case Neoverse512TVB:
PrefFunctionAlignment = Align(16);
VScaleForTuning = 1;
- MaxInterleaveFactor = 4;
break;
case Saphira:
- MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
@@ -302,7 +283,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -328,7 +308,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -341,18 +320,15 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(64);
PrefLoopAlignment = Align(64);
- MaxInterleaveFactor = 4;
break;
case Oryon:
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
break;
case Olympus:
EpilogueVectorizationMinVF = 8;
- MaxInterleaveFactor = 4;
ScatterOverhead = 13;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
More information about the llvm-commits mailing list