[llvm] [AArch64][CostModel] Add constraints on which partial reductions are (PR #163728)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 20 04:06:02 PDT 2025
https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/163728
>From 431c530d85b779d75bc20a87ef543b74d023f309 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Thu, 16 Oct 2025 02:39:44 -0700
Subject: [PATCH 1/3] [AArch64][CostModel] Add constraints on which partial
reductions are natively supported on Neon and SVE
PR #158641 refined and refactored the cost model for partial reductions.
While doing so, it missed out on certain constraints. Specifically,
cases like i32 -> i64 partial reduce are not natively supported. This
patch adds back the condition/constraint that was present before PR #158641
---
.../AArch64/AArch64TargetTransformInfo.cpp | 3 +
.../LoopVectorize/AArch64/partial-reduce.ll | 82 +++++++++----------
2 files changed, 44 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..bfb2cacbd7dca 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5661,6 +5661,9 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
if (VF.getKnownMinValue() <= Ratio)
return Invalid;
+ // i32 -> i64 or i16 -> i32 is not natively supported on Neon and SVE.
+ if (Ratio < 4)
+ return Invalid;
VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
VectorType *AccumVectorType =
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 1e6bcb12029e7..0a61e8e9a505f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <4 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <4 x i64> [[VEC_PHI]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <4 x i64> [[VEC_PHI1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <4 x i64> [[VEC_PHI2]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <4 x i64> [[VEC_PHI3]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <4 x i64> [[TMP10]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP11]], [[BIN_RDX7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX8]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2]] = add <4 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
>From ff2ff9308828d894942fcb3cfdcba85c0bdf5237 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Sun, 19 Oct 2025 18:21:57 -0700
Subject: [PATCH 2/3] address comments
This adds some additional cost modelling and corresponding test cases.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 35 ++-
.../LoopVectorize/AArch64/partial-reduce.ll | 255 ++++++++++++++++--
2 files changed, 269 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index bfb2cacbd7dca..186dd19bfcf25 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5661,9 +5661,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
if (VF.getKnownMinValue() <= Ratio)
return Invalid;
- // i32 -> i64 or i16 -> i32 is not natively supported on Neon and SVE.
- if (Ratio < 4)
- return Invalid;
VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
VectorType *AccumVectorType =
@@ -5724,6 +5721,38 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost;
}
+ // FIXME:
+ // 1. Do cost modelling for USDOT.
+ // 2. Refactor the whole code here.
+ if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
+ if (AccumLT.second.getScalarType() == MVT::i32 &&
+ InputLT.second.getScalarType() == MVT::i16) {
+ // i16 -> i32 is supported in SVE 2.1
+ if (ST->hasSVE2p1())
+ return Cost;
+ // umlalt + umlalb. Same goes for signed types.
+ return Cost + 1;
+ }
+ if (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i32)
+ return Cost + 1;
+ }
+ if (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
+ ST->hasDotProd() && !IsUSDot) {
+ // umull + umull2 + (2 * uaddw) + (2 * uaddw2). Same goes for signed types.
+ if (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i16)
+ return Cost + 5;
+ // umlal + umlal2. Same goes for signed types.
+ if ((AccumLT.second.getScalarType() == MVT::i32 &&
+ InputLT.second.getScalarType() == MVT::i16) ||
+ (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i32))
+ return Cost + 1;
+ }
+
+ // FIXME: This should be more expensive for NEON as we see fmov instructions
+ // with very low throughput.
// Add additional cost for the extends that would need to be inserted.
return Cost + 4;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 0a61e8e9a505f..94832b97ecb65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1125,16 +1125,16 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <4 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1151,10 +1151,10 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
@@ -1167,18 +1167,18 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <4 x i64> [[VEC_PHI]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <4 x i64> [[VEC_PHI1]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <4 x i64> [[VEC_PHI2]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <4 x i64> [[VEC_PHI3]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <4 x i64> [[TMP10]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP11]], [[BIN_RDX7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX8]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <2 x i64> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <2 x i64> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX11]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1195,16 +1195,16 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-MAXBW-NEXT: [[TMP2]] = add <4 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -1227,9 +1227,228 @@ exit:
ret i64 %add
}
+; FIXME: With SVE 2.1, this should generate vscale type vectors.
+define i32 @sext_reduction_i16_to_i32_with_sve2p1(ptr %arr, i64 %n) #3 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @sext_reduction_i16_to_i32_with_sve2p1(
+; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @sext_reduction_i16_to_i32_with_sve2p1(
+; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD2]] to <8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define i32 @sext_reduction_i16_to_i32_with_sve2p1(
+; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %acc = phi i32 [ 0, %entry ], [ %add, %loop ]
+ %gep = getelementptr inbounds i16, ptr %arr, i64 %iv
+ %load = load i16, ptr %gep
+ %sext = sext i16 %load to i32
+ %add = add i32 %acc, %sext
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ult i64 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %add
+}
+
+define i32 @sext_reduction_i16_to_i32(ptr %arr, i64 %n) #1 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @sext_reduction_i16_to_i32(
+; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @sext_reduction_i16_to_i32(
+; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 24
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD2]] to <8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[WIDE_LOAD5]] to <8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[WIDE_LOAD6]] to <8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI2]], <8 x i32> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI3]], <8 x i32> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define i32 @sext_reduction_i16_to_i32(
+; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %acc = phi i32 [ 0, %entry ], [ %add, %loop ]
+ %gep = getelementptr inbounds i16, ptr %arr, i64 %iv
+ %load = load i16, ptr %gep
+ %sext = sext i16 %load to i32
+ %add = add i32 %acc, %sext
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ult i64 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %add
+}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "target-cpu"="neoverse-v2" }
attributes #2 = { "target-features"="+neon,+dotprod" }
+attributes #3 = { vscale_range(1,16) "target-features"="+sve2p1" }
>From 4194629ecb9fdf2c243c8806d6f395a701354028 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Mon, 20 Oct 2025 04:03:45 -0700
Subject: [PATCH 3/3] address comments
---
.../AArch64/AArch64TargetTransformInfo.cpp | 34 +--
.../LoopVectorize/AArch64/partial-reduce.ll | 219 ------------------
2 files changed, 1 insertion(+), 252 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 186dd19bfcf25..e3370d31a0e39 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5721,40 +5721,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost;
}
- // FIXME:
- // 1. Do cost modelling for USDOT.
- // 2. Refactor the whole code here.
- if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
- if (AccumLT.second.getScalarType() == MVT::i32 &&
- InputLT.second.getScalarType() == MVT::i16) {
- // i16 -> i32 is supported in SVE 2.1
- if (ST->hasSVE2p1())
- return Cost;
- // umlalt + umlalb. Same goes for signed types.
- return Cost + 1;
- }
- if (AccumLT.second.getScalarType() == MVT::i64 &&
- InputLT.second.getScalarType() == MVT::i32)
- return Cost + 1;
- }
- if (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
- ST->hasDotProd() && !IsUSDot) {
- // umull + umull2 + (2 * uaddw) + (2 * uaddw2). Same goes for signed types.
- if (AccumLT.second.getScalarType() == MVT::i64 &&
- InputLT.second.getScalarType() == MVT::i16)
- return Cost + 5;
- // umlal + umlal2. Same goes for signed types.
- if ((AccumLT.second.getScalarType() == MVT::i32 &&
- InputLT.second.getScalarType() == MVT::i16) ||
- (AccumLT.second.getScalarType() == MVT::i64 &&
- InputLT.second.getScalarType() == MVT::i32))
- return Cost + 1;
- }
-
- // FIXME: This should be more expensive for NEON as we see fmov instructions
- // with very low throughput.
// Add additional cost for the extends that would need to be inserted.
- return Cost + 4;
+ return Cost + 2;
}
InstructionCost
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 94832b97ecb65..eb1da3ce3be64 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1227,228 +1227,9 @@ exit:
ret i64 %add
}
-; FIXME: With SVE 2.1, this should generate vscale type vectors.
-define i32 @sext_reduction_i16_to_i32_with_sve2p1(ptr %arr, i64 %n) #3 {
-; CHECK-INTERLEAVE1-LABEL: define i32 @sext_reduction_i16_to_i32_with_sve2p1(
-; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: vector.body:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVE1: scalar.ph:
-;
-; CHECK-INTERLEAVED-LABEL: define i32 @sext_reduction_i16_to_i32_with_sve2p1(
-; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED: vector.body:
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 8
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD2]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP2]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVED: scalar.ph:
-;
-; CHECK-MAXBW-LABEL: define i32 @sext_reduction_i16_to_i32_with_sve2p1(
-; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-MAXBW-NEXT: entry:
-; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
-; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-MAXBW: vector.body:
-; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-MAXBW: scalar.ph:
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %acc = phi i32 [ 0, %entry ], [ %add, %loop ]
- %gep = getelementptr inbounds i16, ptr %arr, i64 %iv
- %load = load i16, ptr %gep
- %sext = sext i16 %load to i32
- %add = add i32 %acc, %sext
- %iv.next = add i64 %iv, 1
- %cmp = icmp ult i64 %iv.next, %n
- br i1 %cmp, label %loop, label %exit
-
-exit:
- ret i32 %add
-}
-
-define i32 @sext_reduction_i16_to_i32(ptr %arr, i64 %n) #1 {
-; CHECK-INTERLEAVE1-LABEL: define i32 @sext_reduction_i16_to_i32(
-; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: vector.body:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVE1: scalar.ph:
-;
-; CHECK-INTERLEAVED-LABEL: define i32 @sext_reduction_i16_to_i32(
-; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
-; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 32
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 32
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED: vector.body:
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 8
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 24
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD2]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[WIDE_LOAD5]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[WIDE_LOAD6]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP2]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI2]], <8 x i32> [[TMP6]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI3]], <8 x i32> [[TMP7]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVED: scalar.ph:
-;
-; CHECK-MAXBW-LABEL: define i32 @sext_reduction_i16_to_i32(
-; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
-; CHECK-MAXBW-NEXT: entry:
-; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
-; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-MAXBW: vector.body:
-; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[VEC_PHI]], <8 x i32> [[TMP1]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-MAXBW: scalar.ph:
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %acc = phi i32 [ 0, %entry ], [ %add, %loop ]
- %gep = getelementptr inbounds i16, ptr %arr, i64 %iv
- %load = load i16, ptr %gep
- %sext = sext i16 %load to i32
- %add = add i32 %acc, %sext
- %iv.next = add i64 %iv, 1
- %cmp = icmp ult i64 %iv.next, %n
- br i1 %cmp, label %loop, label %exit
-
-exit:
- ret i32 %add
-}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "target-cpu"="neoverse-v2" }
attributes #2 = { "target-features"="+neon,+dotprod" }
-attributes #3 = { vscale_range(1,16) "target-features"="+sve2p1" }
More information about the llvm-commits
mailing list