[llvm] [AArch64][CostModel] Consider i32 --> i64 partial reduce cost as Invalid for FixedLength vectors (PR #165226)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 27 03:05:09 PDT 2025
https://github.com/sushgokh created https://github.com/llvm/llvm-project/pull/165226
PR #163728 was intended to address Numba regressions on Neoverse-V2, which involve patterns of the form
```
size_t foo(int *a)
{
  size_t sum = 0;
  for (...)
    sum += (size_t) a[i]; // size_t is a 64-bit integer here
  return sum;
}
```
Unfortunately, it introduced further regressions.
Briefly, the cause of the regression is that, for code like the above, the partial reduce intrinsics are now lowered on Neoverse-V2 to SMLAL[BT] rather than SADDW as before. Per the SWOG, SMLAL[BT] has higher latency and lower throughput than SADDW.
This patch addresses the issue by making the cost model treat such i32 to i64 partial reductions as Invalid for fixed-length vectors. A more detailed write-up is in the PR.
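The essence of the change is a single early-exit in AArch64TTIImpl::getPartialReductionCost; the snippet below is a condensed view of the check, copied from the diff that follows:
```
// Condensed from PATCH 2/2 below: return an Invalid cost for NEON
// fixed-length i32 -> i64 partial reductions, so the vectorizer keeps the
// plain extend + add (SADDW) form instead of forming SMLAL[BT].
if (!ST->useSVEForFixedLengthVectors() &&
    (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
     ST->hasDotProd()) &&
    (AccumLT.second.getScalarType() == MVT::i64 &&
     InputLT.second.getScalarType() == MVT::i32))
  return Invalid;
```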
>From 733836f587beaa5540b1bbad0adf8699933a4fe7 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Mon, 27 Oct 2025 02:45:17 -0700
Subject: [PATCH 1/2] Revert "[AArch64][CostModel] Add constraints on which
partial reductions are (#163728)"
This reverts commit 005ec78b71c8790a58cb1124e8ca2fcf47a880a0.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 2 +-
.../LoopVectorize/AArch64/partial-reduce.ll | 82 +++++++++----------
2 files changed, 42 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fede586cf35bc..9ccfcecb2e20e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5758,7 +5758,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
}
// Add additional cost for the extends that would need to be inserted.
- return Cost + 2;
+ return Cost + 4;
}
InstructionCost
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 46ec858d7455c..522a6d69d3a6c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]])
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]])
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]])
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <2 x i64> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <2 x i64> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX11]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
>From 62ebd18bb21f956febca0d43950b7a492793f881 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Mon, 27 Oct 2025 02:53:47 -0700
Subject: [PATCH 2/2] [AArch64][CostModel] Consider i32 to i64 partial reduce
cost as Invalid
This addresses the issue introduced by PR #158641 and further aggravated
by PR #163728
---
.../AArch64/AArch64TargetTransformInfo.cpp | 7 ++
.../AArch64/partial-reduce-constant-ops.ll | 16 ++--
.../LoopVectorize/AArch64/partial-reduce.ll | 82 +++++++++----------
3 files changed, 56 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9ccfcecb2e20e..6945ee19caa72 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5757,6 +5757,13 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost;
}
+ if (!ST->useSVEForFixedLengthVectors() &&
+ (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
+ ST->hasDotProd()) &&
+ (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i32))
+ return Invalid;
+
// Add additional cost for the extends that would need to be inserted.
return Cost + 4;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index b430efc9e5283..16df0f08d0a7c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -482,29 +482,29 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[TMP8]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -520,9 +520,9 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 522a6d69d3a6c..0dfa6962700b3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <4 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <4 x i64> [[VEC_PHI]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <4 x i64> [[VEC_PHI1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <4 x i64> [[VEC_PHI2]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <4 x i64> [[VEC_PHI3]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <4 x i64> [[TMP10]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP11]], [[BIN_RDX7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX8]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2]] = add <4 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph: