[llvm] [AArch64] Enable aggressive interleaving for A320 (PR #169825)
Nashe Mncube via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 10 02:23:10 PST 2025
https://github.com/nasherm updated https://github.com/llvm/llvm-project/pull/169825
>From 613bfd6381a01b9c951080d1a4c1be1f64ea2c76 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Thu, 27 Nov 2025 15:50:15 +0000
Subject: [PATCH 1/5] [AArch64] Enable aggressive interleaving for A320
This patch enables aggressive interleaving during vectorization
for the A320 subtarget. This is done by adding a new tuning flag
to the AArch64Subtarget class and exposing it through the
enableAggressiveInterleaving() TTI hook. With this flag enabled we
see an aggregate uplift of 0.7% on internal benchmark suites, with
up to 51% uplift on individual benchmark workloads.
Change-Id: I53367df0aef299e0d02ce9150a105f12fa575b9d
---
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 6 ++++++
llvm/lib/Target/AArch64/AArch64Subtarget.h | 4 ++++
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 ++
3 files changed, 12 insertions(+)
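As an aside for readers unfamiliar with the hook:
enableAggressiveInterleaving(bool LoopHasReductions) is an existing
TargetTransformInfo query that LoopVectorize consults when selecting an
interleave count. A minimal sketch of that kind of consumer follows; the
function and its surrounding logic are illustrative assumptions, not the
actual in-tree vectorizer code.

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include <algorithm>
  using namespace llvm;

  // Simplified stand-in for an interleave-count decision: when the target's
  // hook returns true (as the A320 override in this patch arranges), the
  // caller may interleave even where the plain cost model would settle for
  // an interleave count of 1.
  static unsigned selectInterleaveCount(const TargetTransformInfo &TTI,
                                        bool LoopHasReductions,
                                        unsigned CostModelIC) {
    if (TTI.enableAggressiveInterleaving(LoopHasReductions))
      return std::max(CostModelIC, 2u); // interleave small loops too
    return CostModelIC;
  }
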
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index dae4f6a82e3aa..4a63b11f665c6 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -175,6 +175,12 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case CortexA320:
+ AggressiveInterleaving = true;
+ PrefFunctionAlignment = Align(16);
+ VScaleForTuning = 1;
+ PrefLoopAlignment = Align(16);
+ MaxBytesForLoopAlignment = 8;
+ break;
case CortexA510:
case CortexA520:
PrefFunctionAlignment = Align(16);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 8553f16a6c937..d0c482cc403a0 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -93,6 +93,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool EnableSubregLiveness;
+ bool AggressiveInterleaving = false;
+
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
@@ -484,6 +486,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
/// a function.
std::optional<uint16_t>
getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const;
+
+ bool enableAggressiveInterleaving() const { return AggressiveInterleaving; }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ecefe2a7f1380..adcac6a409c26 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -537,6 +537,8 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
bool isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
+
+ bool enableAggressiveInterleaving(bool) const override { return ST->enableAggressiveInterleaving(); }
/// @}
};
>From 7a5e60be1ae6e65fcb92d43f8503c537b57d4799 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Thu, 27 Nov 2025 17:00:13 +0000
Subject: [PATCH 2/5] Code formatting
Change-Id: I6de3eec3da77cf211b060ed9541f1caee7fe0606
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index adcac6a409c26..c9bf44b15144a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -538,7 +538,9 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
bool isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
- bool enableAggressiveInterleaving(bool) const override { return ST->enableAggressiveInterleaving(); }
+ bool enableAggressiveInterleaving(bool) const override {
+ return ST->enableAggressiveInterleaving();
+ }
/// @}
};
>From e339183005d52801c899d1e30bebe6972df4db21 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Mon, 1 Dec 2025 16:23:04 +0000
Subject: [PATCH 3/5] Review comments
- added a test for aggressive interleaving codegen on A320
- added SubtargetFeature for AggressiveInterleaving
Change-Id: Idd0c33346223dbe53d3f540d2f38a4028d6eb2e8
---
llvm/lib/Target/AArch64/AArch64Features.td | 4 +
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 6 +-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 2 -
.../AArch64/aggressive-interleaving.ll | 328 ++++++++++++++++++
4 files changed, 333 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/aggressive-interleaving.ll
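A note on the mechanism (an aside, not part of the diff): the third argument
of a SubtargetFeature record is the value the named field takes when the
feature bit is set, and modeling the flag as a feature also lets it be toggled
independently of -mcpu, e.g. with -mattr=+aggressive-interleaving or
-mattr=-aggressive-interleaving. As defined here the value-when-enabled is
"false"; patch 5 of this series corrects it to "true".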
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index f1baaf82195f9..226f406e4ff5d 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -919,6 +919,10 @@ def FeatureDisableUnpredicatedLdStLower : SubtargetFeature<
"disable-unpredicated-ld-st-lower", "DisableUnpredicatedLdStLower",
"true", "Disable lowering unpredicated loads/stores as LDR/STR">;
+def FeatureAggressiveInterleaving : SubtargetFeature<"aggressive-interleaving",
+ "AggressiveInterleaving", "false",
+ "Make use of aggressive interleaving during vectorization">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 4a63b11f665c6..13b5ca5e080b2 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -176,11 +176,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
break;
case CortexA320:
AggressiveInterleaving = true;
- PrefFunctionAlignment = Align(16);
- VScaleForTuning = 1;
- PrefLoopAlignment = Align(16);
- MaxBytesForLoopAlignment = 8;
- break;
+ [[fallthrough]];
case CortexA510:
case CortexA520:
PrefFunctionAlignment = Align(16);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index d0c482cc403a0..bfd696a449859 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -93,8 +93,6 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool EnableSubregLiveness;
- bool AggressiveInterleaving = false;
-
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
diff --git a/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll b/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll
new file mode 100644
index 0000000000000..cd4ce3c8d0138
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=loop-vectorize -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a320 2>&1 | FileCheck %s --check-prefix=A320
+
+; The loop below is a small counted loop with a scalar reduction.
+; When AggressiveInterleaving is enabled for this subtarget, LoopVectorize
+; should choose an interleave count > 1 even at VF == 1, which manifests as
+; multiple loads / multiplies / adds in the vector body.
+
+
+define void @test_interleave_reduction(i32*** %arg, double** %arg1) {
+; A320-LABEL: define void @test_interleave_reduction(
+; A320-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; A320-NEXT: [[ENTRY:.*:]]
+; A320-NEXT: [[TPM15:%.*]] = load ptr, ptr [[ARG]], align 8
+; A320-NEXT: [[TPM19:%.*]] = load ptr, ptr [[ARG1]], align 8
+; A320-NEXT: br label %[[OUTER:.*]]
+; A320: [[OUTER]]:
+; A320-NEXT: [[TPM26:%.*]] = add i64 0, 1
+; A320-NEXT: [[TPM10:%.*]] = alloca i32, align 8
+; A320-NEXT: [[TPM102:%.*]] = ptrtoint ptr [[TPM10]] to i64
+; A320-NEXT: [[TPM27:%.*]] = getelementptr inbounds i32, ptr [[TPM10]], i64 [[TPM26]]
+; A320-NEXT: [[TPM28:%.*]] = getelementptr inbounds ptr, ptr [[TPM15]], i64 0
+; A320-NEXT: [[TPM29:%.*]] = load ptr, ptr [[TPM28]], align 8
+; A320-NEXT: [[TPM291:%.*]] = ptrtoint ptr [[TPM29]] to i64
+; A320-NEXT: [[TPM17:%.*]] = alloca double, align 8
+; A320-NEXT: [[TPM32:%.*]] = getelementptr inbounds double, ptr [[TPM17]], i64 [[TPM26]]
+; A320-NEXT: [[TMP0:%.*]] = add i64 [[TPM291]], -8
+; A320-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[TPM102]]
+; A320-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
+; A320-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
+; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; A320: [[VECTOR_PH]]:
+; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
+; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; A320-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4
+; A320-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP4]]
+; A320-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 8
+; A320-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP5]]
+; A320-NEXT: br label %[[VECTOR_BODY:.*]]
+; A320: [[VECTOR_BODY]]:
+; A320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI5:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; A320-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; A320-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
+; A320-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP6]]
+; A320-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP7]]
+; A320-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], 8
+; A320-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX7]], 0
+; A320-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX7]], 8
+; A320-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP8]]
+; A320-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP9]]
+; A320-NEXT: [[TMP10:%.*]] = load double, ptr [[NEXT_GEP8]], align 8
+; A320-NEXT: [[TMP11:%.*]] = load double, ptr [[NEXT_GEP9]], align 8
+; A320-NEXT: [[TMP12:%.*]] = load i32, ptr [[NEXT_GEP]], align 4
+; A320-NEXT: [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP6]], align 4
+; A320-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; A320-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
+; A320-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 [[TMP14]]
+; A320-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 [[TMP15]]
+; A320-NEXT: [[TMP18:%.*]] = load double, ptr [[TMP16]], align 8
+; A320-NEXT: [[TMP19:%.*]] = load double, ptr [[TMP17]], align 8
+; A320-NEXT: [[TMP20:%.*]] = fmul fast double [[TMP18]], [[TMP10]]
+; A320-NEXT: [[TMP21:%.*]] = fmul fast double [[TMP19]], [[TMP11]]
+; A320-NEXT: [[TMP22]] = fadd fast double [[TMP20]], [[VEC_PHI]]
+; A320-NEXT: [[TMP23]] = fadd fast double [[TMP21]], [[VEC_PHI5]]
+; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; A320-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; A320-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; A320: [[MIDDLE_BLOCK]]:
+; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP23]], [[TMP22]]
+; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_INNER:.*]], label %[[SCALAR_PH]]
+; A320: [[SCALAR_PH]]:
+; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TPM27]], %[[OUTER]] ]
+; A320-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], %[[MIDDLE_BLOCK]] ], [ [[TPM32]], %[[OUTER]] ]
+; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[OUTER]] ]
+; A320-NEXT: br label %[[INNER:.*]]
+; A320: [[INNER]]:
+; A320-NEXT: [[PHI_PTR_I32:%.*]] = phi ptr [ [[NEXT_I32:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; A320-NEXT: [[PHI_PTR_F64:%.*]] = phi ptr [ [[NEXT_F64:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ]
+; A320-NEXT: [[PHI_ACC:%.*]] = phi double [ [[TPM50:%.*]], %[[INNER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; A320-NEXT: [[TPM44:%.*]] = load double, ptr [[PHI_PTR_F64]], align 8
+; A320-NEXT: [[TPM45:%.*]] = load i32, ptr [[PHI_PTR_I32]], align 4
+; A320-NEXT: [[TPM46:%.*]] = zext i32 [[TPM45]] to i64
+; A320-NEXT: [[TPM47:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 [[TPM46]]
+; A320-NEXT: [[TPM48:%.*]] = load double, ptr [[TPM47]], align 8
+; A320-NEXT: [[TPM49:%.*]] = fmul fast double [[TPM48]], [[TPM44]]
+; A320-NEXT: [[TPM50]] = fadd fast double [[TPM49]], [[PHI_ACC]]
+; A320-NEXT: [[NEXT_I32]] = getelementptr inbounds i32, ptr [[PHI_PTR_I32]], i64 1
+; A320-NEXT: [[NEXT_F64]] = getelementptr inbounds double, ptr [[PHI_PTR_F64]], i64 1
+; A320-NEXT: [[DONE:%.*]] = icmp eq ptr [[NEXT_I32]], [[TPM29]]
+; A320-NEXT: br i1 [[DONE]], label %[[EXIT_INNER]], label %[[INNER]], !llvm.loop [[LOOP3:![0-9]+]]
+; A320: [[EXIT_INNER]]:
+; A320-NEXT: [[TPM50_LCSSA:%.*]] = phi double [ [[TPM50]], %[[INNER]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
+; A320-NEXT: [[TPM35:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 0
+; A320-NEXT: [[TPM37:%.*]] = fsub fast double 0.000000e+00, [[TPM50_LCSSA]]
+; A320-NEXT: store double [[TPM37]], ptr [[TPM35]], align 8
+; A320-NEXT: br label %[[OUTER]]
+;
+entry:
+ %tpm15 = load i32**, i32*** %arg, align 8
+ %tpm19 = load double*, double** %arg1, align 8
+ br label %outer
+
+outer: ; preds = %inner, %entry
+ %tpm26 = add i64 0, 1
+ %tpm10 = alloca i32, align 8
+ %tpm27 = getelementptr inbounds i32, ptr %tpm10, i64 %tpm26
+ %tpm28 = getelementptr inbounds i32*, ptr %tpm15, i64 0
+ %tpm29 = load i32*, ptr %tpm28, align 8
+ %tpm17 = alloca double, align 8
+ %tpm32 = getelementptr inbounds double, ptr %tpm17, i64 %tpm26
+ br label %inner
+
+inner: ; preds = %inner, %outer
+ %phi.ptr.i32 = phi ptr [ %next.i32, %inner ], [ %tpm27, %outer ]
+ %phi.ptr.f64 = phi ptr [ %next.f64, %inner ], [ %tpm32, %outer ]
+ %phi.acc = phi double [ %tpm50, %inner ], [ 0.0, %outer ]
+
+ %tpm44 = load double, ptr %phi.ptr.f64, align 8
+ %tpm45 = load i32, ptr %phi.ptr.i32, align 4
+ %tpm46 = zext i32 %tpm45 to i64
+ %tpm47 = getelementptr inbounds double, ptr %tpm19, i64 %tpm46
+ %tpm48 = load double, ptr %tpm47, align 8
+ %tpm49 = fmul fast double %tpm48, %tpm44
+ %tpm50 = fadd fast double %tpm49, %phi.acc
+
+ %next.i32 = getelementptr inbounds i32, ptr %phi.ptr.i32, i64 1
+ %next.f64 = getelementptr inbounds double, ptr %phi.ptr.f64, i64 1
+ %done = icmp eq ptr %next.i32, %tpm29
+ br i1 %done, label %exit.inner, label %inner
+
+exit.inner: ; preds = %inner
+ %tpm35 = getelementptr inbounds double, ptr %tpm19, i64 0
+ %tpm37 = fsub fast double 0.0, %tpm50
+ store double %tpm37, ptr %tpm35, align 8
+ br label %outer
+}
+
+;===---------------------------------------------------------------------===;
+; 1) Simple sum-reduction over one array
+; Expect: VF = 1 with interleave count > 1, so vector.body contains
+; duplicated loads and adds.
+;===---------------------------------------------------------------------===;
+
+define double @sum_reduction(double* nocapture readonly %a, i64 %n) {
+; A320-LABEL: define double @sum_reduction(
+; A320-SAME: ptr nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; A320-NEXT: [[ENTRY:.*]]:
+; A320-NEXT: [[CMP0:%.*]] = icmp eq i64 [[N]], 0
+; A320-NEXT: br i1 [[CMP0]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
+; A320: [[LOOP_PREHEADER]]:
+; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; A320: [[VECTOR_PH]]:
+; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; A320-NEXT: br label %[[VECTOR_BODY:.*]]
+; A320: [[VECTOR_BODY]]:
+; A320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI1:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; A320-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; A320-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
+; A320-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP1]]
+; A320-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP2]], align 8
+; A320-NEXT: [[TMP5:%.*]] = load double, ptr [[TMP3]], align 8
+; A320-NEXT: [[TMP6]] = fadd fast double [[VEC_PHI]], [[TMP4]]
+; A320-NEXT: [[TMP7]] = fadd fast double [[VEC_PHI1]], [[TMP5]]
+; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; A320-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; A320-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; A320: [[MIDDLE_BLOCK]]:
+; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP7]], [[TMP6]]
+; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; A320: [[SCALAR_PH]]:
+; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
+; A320-NEXT: br label %[[LOOP:.*]]
+; A320: [[LOOP]]:
+; A320-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; A320-NEXT: [[SUM:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ]
+; A320-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
+; A320-NEXT: [[VAL:%.*]] = load double, ptr [[GEP]], align 8
+; A320-NEXT: [[SUM_NEXT]] = fadd fast double [[SUM]], [[VAL]]
+; A320-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; A320-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; A320-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; A320: [[EXIT_LOOPEXIT]]:
+; A320-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi double [ [[SUM_NEXT]], %[[LOOP]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
+; A320-NEXT: br label %[[EXIT]]
+; A320: [[EXIT]]:
+; A320-NEXT: [[RES:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; A320-NEXT: ret double [[RES]]
+;
+entry:
+ %cmp0 = icmp eq i64 %n, 0
+ br i1 %cmp0, label %exit, label %loop.preheader
+
+loop.preheader:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+ %sum = phi double [ 0.0, %loop.preheader ], [ %sum.next, %loop ]
+
+ %gep = getelementptr inbounds double, ptr %a, i64 %iv
+ %val = load double, ptr %gep, align 8
+ %sum.next = fadd fast double %sum, %val
+
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp ult i64 %iv.next, %n
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ %res = phi double [ 0.0, %entry ], [ %sum.next, %loop ]
+ ret double %res
+}
+
+;===---------------------------------------------------------------------===;
+; 2) Dot-product of two arrays
+; Expect: again, VF = 1 with interleave count > 1. The vector body should
+; have multiple pairs of loads and fmuls/fadds.
+;===---------------------------------------------------------------------===;
+
+define double @dot_product(double* nocapture readonly %a, double* nocapture readonly %b, i64 %n) {
+; A320-LABEL: define double @dot_product(
+; A320-SAME: ptr nocapture readonly [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; A320-NEXT: [[ENTRY:.*]]:
+; A320-NEXT: [[CMP0:%.*]] = icmp eq i64 [[N]], 0
+; A320-NEXT: br i1 [[CMP0]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
+; A320: [[LOOP_PREHEADER]]:
+; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; A320: [[VECTOR_PH]]:
+; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; A320-NEXT: br label %[[VECTOR_BODY:.*]]
+; A320: [[VECTOR_BODY]]:
+; A320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI1:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; A320-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; A320-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
+; A320-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP1]]
+; A320-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP0]]
+; A320-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP1]]
+; A320-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP2]], align 8
+; A320-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP3]], align 8
+; A320-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8
+; A320-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8
+; A320-NEXT: [[TMP10:%.*]] = fmul fast double [[TMP6]], [[TMP8]]
+; A320-NEXT: [[TMP11:%.*]] = fmul fast double [[TMP7]], [[TMP9]]
+; A320-NEXT: [[TMP12]] = fadd fast double [[VEC_PHI]], [[TMP10]]
+; A320-NEXT: [[TMP13]] = fadd fast double [[VEC_PHI1]], [[TMP11]]
+; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; A320-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; A320-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; A320: [[MIDDLE_BLOCK]]:
+; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP13]], [[TMP12]]
+; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; A320: [[SCALAR_PH]]:
+; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
+; A320-NEXT: br label %[[LOOP:.*]]
+; A320: [[LOOP]]:
+; A320-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; A320-NEXT: [[ACC:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ACC_NEXT:%.*]], %[[LOOP]] ]
+; A320-NEXT: [[GEP_A:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
+; A320-NEXT: [[GEP_B:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[IV]]
+; A320-NEXT: [[VA:%.*]] = load double, ptr [[GEP_A]], align 8
+; A320-NEXT: [[VB:%.*]] = load double, ptr [[GEP_B]], align 8
+; A320-NEXT: [[PROD:%.*]] = fmul fast double [[VA]], [[VB]]
+; A320-NEXT: [[ACC_NEXT]] = fadd fast double [[ACC]], [[PROD]]
+; A320-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; A320-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; A320-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; A320: [[EXIT_LOOPEXIT]]:
+; A320-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi double [ [[ACC_NEXT]], %[[LOOP]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
+; A320-NEXT: br label %[[EXIT]]
+; A320: [[EXIT]]:
+; A320-NEXT: [[RES:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ACC_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; A320-NEXT: ret double [[RES]]
+;
+entry:
+ %cmp0 = icmp eq i64 %n, 0
+ br i1 %cmp0, label %exit, label %loop.preheader
+
+loop.preheader:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+ %acc = phi double [ 0.0, %loop.preheader ], [ %acc.next, %loop ]
+
+ %gep.a = getelementptr inbounds double, ptr %a, i64 %iv
+ %gep.b = getelementptr inbounds double, ptr %b, i64 %iv
+ %va = load double, ptr %gep.a, align 8
+ %vb = load double, ptr %gep.b, align 8
+
+ %prod = fmul fast double %va, %vb
+ %acc.next = fadd fast double %acc, %prod
+
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp ult i64 %iv.next, %n
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ %res = phi double [ 0.0, %entry ], [ %acc.next, %loop ]
+ ret double %res
+}
+;.
+; A320: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; A320: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; A320: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; A320: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; A320: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; A320: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; A320: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; A320: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+;.
>From 6fedc7b4b96c8c3f44641f38e41e1548e7686388 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 9 Dec 2025 17:05:39 +0000
Subject: [PATCH 4/5] Update test
Change-Id: Icad4c882f22cd1bbd787c5eedcee27ef74f31b7c
---
.../AArch64/aggressive-interleaving.ll | 90 +++++++++----------
1 file changed, 43 insertions(+), 47 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll b/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll
index cd4ce3c8d0138..4951a5e9ef070 100644
--- a/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll
+++ b/llvm/test/CodeGen/AArch64/aggressive-interleaving.ll
@@ -43,14 +43,12 @@ define void @test_interleave_reduction(i32*** %arg, double** %arg1) {
; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI5:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; A320-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
; A320-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
-; A320-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP6]]
+; A320-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[OFFSET_IDX]]
; A320-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP7]]
; A320-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], 8
-; A320-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX7]], 0
; A320-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX7]], 8
-; A320-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP8]]
+; A320-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[OFFSET_IDX7]]
; A320-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP9]]
; A320-NEXT: [[TMP10:%.*]] = load double, ptr [[NEXT_GEP8]], align 8
; A320-NEXT: [[TMP11:%.*]] = load double, ptr [[NEXT_GEP9]], align 8
@@ -75,12 +73,12 @@ define void @test_interleave_reduction(i32*** %arg, double** %arg1) {
; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_INNER:.*]], label %[[SCALAR_PH]]
; A320: [[SCALAR_PH]]:
; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TPM27]], %[[OUTER]] ]
-; A320-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], %[[MIDDLE_BLOCK]] ], [ [[TPM32]], %[[OUTER]] ]
+; A320-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END3]], %[[MIDDLE_BLOCK]] ], [ [[TPM32]], %[[OUTER]] ]
; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[OUTER]] ]
; A320-NEXT: br label %[[INNER:.*]]
; A320: [[INNER]]:
; A320-NEXT: [[PHI_PTR_I32:%.*]] = phi ptr [ [[NEXT_I32:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; A320-NEXT: [[PHI_PTR_F64:%.*]] = phi ptr [ [[NEXT_F64:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ]
+; A320-NEXT: [[PHI_PTR_F64:%.*]] = phi ptr [ [[NEXT_F64:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL8]], %[[SCALAR_PH]] ]
; A320-NEXT: [[PHI_ACC:%.*]] = phi double [ [[TPM50:%.*]], %[[INNER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; A320-NEXT: [[TPM44:%.*]] = load double, ptr [[PHI_PTR_F64]], align 8
; A320-NEXT: [[TPM45:%.*]] = load i32, ptr [[PHI_PTR_I32]], align 4
@@ -148,39 +146,38 @@ exit.inner: ; preds = %inner
define double @sum_reduction(double* nocapture readonly %a, i64 %n) {
; A320-LABEL: define double @sum_reduction(
-; A320-SAME: ptr nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; A320-SAME: ptr readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; A320-NEXT: [[ENTRY:.*]]:
; A320-NEXT: [[CMP0:%.*]] = icmp eq i64 [[N]], 0
; A320-NEXT: br i1 [[CMP0]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
; A320: [[LOOP_PREHEADER]]:
-; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; A320: [[VECTOR_PH]]:
-; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; A320-NEXT: br label %[[VECTOR_BODY:.*]]
; A320: [[VECTOR_BODY]]:
-; A320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
-; A320-NEXT: [[VEC_PHI1:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; A320-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; A320-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; A320-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
+; A320-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP1]]
-; A320-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP2]], align 8
-; A320-NEXT: [[TMP5:%.*]] = load double, ptr [[TMP3]], align 8
-; A320-NEXT: [[TMP6]] = fadd fast double [[VEC_PHI]], [[TMP4]]
-; A320-NEXT: [[TMP7]] = fadd fast double [[VEC_PHI1]], [[TMP5]]
-; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; A320-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 2
+; A320-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; A320-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; A320-NEXT: [[TMP2]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; A320-NEXT: [[TMP4]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4
; A320-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; A320-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; A320: [[MIDDLE_BLOCK]]:
-; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP7]], [[TMP6]]
+; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP2]]
+; A320-NEXT: [[TMP5:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; A320: [[SCALAR_PH]]:
; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
+; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
; A320-NEXT: br label %[[LOOP:.*]]
; A320: [[LOOP]]:
; A320-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -192,7 +189,7 @@ define double @sum_reduction(double* nocapture readonly %a, i64 %n) {
; A320-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
; A320-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; A320: [[EXIT_LOOPEXIT]]:
-; A320-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi double [ [[SUM_NEXT]], %[[LOOP]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
+; A320-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi double [ [[SUM_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
; A320-NEXT: br label %[[EXIT]]
; A320: [[EXIT]]:
; A320-NEXT: [[RES:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -230,45 +227,44 @@ exit:
define double @dot_product(double* nocapture readonly %a, double* nocapture readonly %b, i64 %n) {
; A320-LABEL: define double @dot_product(
-; A320-SAME: ptr nocapture readonly [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; A320-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; A320-NEXT: [[ENTRY:.*]]:
; A320-NEXT: [[CMP0:%.*]] = icmp eq i64 [[N]], 0
; A320-NEXT: br i1 [[CMP0]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
; A320: [[LOOP_PREHEADER]]:
-; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; A320: [[VECTOR_PH]]:
-; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; A320-NEXT: br label %[[VECTOR_BODY:.*]]
; A320: [[VECTOR_BODY]]:
-; A320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
-; A320-NEXT: [[VEC_PHI1:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
-; A320-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; A320-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; A320-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
+; A320-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; A320-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP1]]
-; A320-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP0]]
; A320-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP1]]
-; A320-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP2]], align 8
-; A320-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP3]], align 8
-; A320-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8
-; A320-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8
-; A320-NEXT: [[TMP10:%.*]] = fmul fast double [[TMP6]], [[TMP8]]
-; A320-NEXT: [[TMP11:%.*]] = fmul fast double [[TMP7]], [[TMP9]]
-; A320-NEXT: [[TMP12]] = fadd fast double [[VEC_PHI]], [[TMP10]]
-; A320-NEXT: [[TMP13]] = fadd fast double [[VEC_PHI1]], [[TMP11]]
-; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; A320-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 2
+; A320-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; A320-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; A320-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 2
+; A320-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; A320-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP8]], align 8
+; A320-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; A320-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
+; A320-NEXT: [[TMP6]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP4]]
+; A320-NEXT: [[TMP7]] = fadd fast <2 x double> [[VEC_PHI1]], [[TMP10]]
+; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4
; A320-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; A320-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; A320: [[MIDDLE_BLOCK]]:
-; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP13]], [[TMP12]]
+; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP6]]
+; A320-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; A320: [[SCALAR_PH]]:
; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
+; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
; A320-NEXT: br label %[[LOOP:.*]]
; A320: [[LOOP]]:
; A320-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -283,7 +279,7 @@ define double @dot_product(double* nocapture readonly %a, double* nocapture read
; A320-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
; A320-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; A320: [[EXIT_LOOPEXIT]]:
-; A320-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi double [ [[ACC_NEXT]], %[[LOOP]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
+; A320-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi double [ [[ACC_NEXT]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; A320-NEXT: br label %[[EXIT]]
; A320: [[EXIT]]:
; A320-NEXT: [[RES:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ACC_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -322,7 +318,7 @@ exit:
; A320: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; A320: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
; A320: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; A320: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; A320: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
; A320: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; A320: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; A320: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
;.
>From 499ba5ecae1fa2c17a41c02b629beb2898eefda3 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Wed, 10 Dec 2025 10:18:10 +0000
Subject: [PATCH 5/5] Fix test failures
Change-Id: Id7010a4705058fb89bdb75ee1eb2335453d19447
---
llvm/lib/Target/AArch64/AArch64Features.td | 2 +-
llvm/lib/Target/AArch64/AArch64Processors.td | 3 ++-
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2 --
3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 226f406e4ff5d..5d9878aac507c 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -920,7 +920,7 @@ def FeatureDisableUnpredicatedLdStLower : SubtargetFeature<
"true", "Disable lowering unpredicated loads/stores as LDR/STR">;
def FeatureAggressiveInterleaving : SubtargetFeature<"aggressive-interleaving",
- "AggressiveInterleaving", "false",
+ "AggressiveInterleaving", "true",
"Make use of aggressive interleaving during vectorization">;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 72882ac078c55..b656fca376b7f 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -23,7 +23,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeatureUseWzrToVecMove,
- FeatureUseFixedOverScalableIfEqualCost]>;
+ FeatureUseFixedOverScalableIfEqualCost,
+ FeatureAggressiveInterleaving]>;
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 13b5ca5e080b2..dae4f6a82e3aa 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -175,8 +175,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case CortexA320:
- AggressiveInterleaving = true;
- [[fallthrough]];
case CortexA510:
case CortexA520:
PrefFunctionAlignment = Align(16);
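To close the loop on this final patch: with the value-when-enabled flipped to
"true" and the feature listed under TuneA320, the TableGen-generated subtarget
code sets the flag whenever the feature bit is on, so the hand-written special
case in initializeProperties() becomes redundant (Cortex-A320 simply shares
the A510/A520 tuning body again). Roughly, the generated code applies the
record as in the sketch below; this is an assumption about the shape of the
emitted .inc file, not a verbatim copy.

  // In the generated AArch64GenSubtargetInfo.inc (simplified sketch):
  if (Bits[AArch64::FeatureAggressiveInterleaving])
    AggressiveInterleaving = true; // the "true" from the .td record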