[llvm] b93d74a - [ARM] Basic getArithmeticReductionCost reduction costs

Sat Oct 17 02:29:46 PDT 2020

Author: David Green
Date: 2020-10-17T10:29:00+01:00
New Revision: b93d74ac9c4e304db6fbc434992c098104ba94a5

URL: https://github.com/llvm/llvm-project/commit/b93d74ac9c4e304db6fbc434992c098104ba94a5
DIFF: https://github.com/llvm/llvm-project/commit/b93d74ac9c4e304db6fbc434992c098104ba94a5.diff

LOG: [ARM] Basic getArithmeticReductionCost reduction costs

This adds some basic costs for MVE reductions. Currently only add
reductions of simple, legal vector types are costed, each as a single
MVE instruction. More complex costing can be added in the future, when
the framework more readily allows it.

Differential Revision: https://reviews.llvm.org/D88980

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1cb9e7283007..c26a77b46091 100644

--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1409,6 +1410,29 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
   return ScalarCost;
 }
 
+int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                           bool IsPairwiseForm,
+                                           TTI::TargetCostKind CostKind) {
+  EVT ValVT = TLI->getValueType(DL, ValTy);
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
+    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+                                             CostKind);
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  static const CostTblEntry CostTblAdd[]{
+      {ISD::ADD, MVT::v16i8, 1},
+      {ISD::ADD, MVT::v8i16, 1},
+      {ISD::ADD, MVT::v4i32, 1},
+  };
+  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
+    return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
+
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+                                           CostKind);
+}
+
 int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
   // Currently we make a somewhat optimistic assumption that active_lane_mask's

diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index ee80be5d7b48..9a0d71924b51 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -247,6 +247,10 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
                                   Align Alignment, TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr);
 
+  int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                 bool IsPairwiseForm,
+                                 TTI::TargetCostKind CostKind);
+
   int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                             TTI::TargetCostKind CostKind);
 

diff  --git a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
index 90de689246bd..8b59de716fcc 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
@@ -5,9 +5,9 @@ define void @add_i8() {
 ; CHECK-LABEL: 'add_i8'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
@@ -34,22 +34,22 @@ define void @add_i16() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i16>
@@ -106,17 +106,17 @@ define void @add_i32() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
@@ -126,22 +126,22 @@ define void @add_i32() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i32>
@@ -396,11 +396,11 @@ define void @mla_i8() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a1m = mul <2 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2m = mul <4 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3m = mul <8 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4m = mul <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0m = mul <1 x i8> undef, undef
@@ -442,37 +442,37 @@ define void @mla_i16() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zb = zext <4 x i8> undef to <4 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i16> %a2za, %a2zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sb = sext <4 x i8> undef to <4 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i16> %a2sa, %a2sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zb = zext <8 x i8> undef to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zm = mul <8 x i16> %a3za, %a3zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sb = sext <8 x i8> undef to <8 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sm = mul <8 x i16> %a3sa, %a3sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4zb = zext <16 x i8> undef to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4zm = mul <16 x i16> %a4za, %a4zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sb = sext <16 x i8> undef to <16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4sm = mul <16 x i16> %a4sa, %a4sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5m = mul <1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a6m = mul <2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7m = mul <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8m = mul <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9m = mul <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i16>
@@ -564,27 +564,27 @@ define void @mla_i32() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2zb = zext <4 x i8> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i32> %a2za, %a2zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sb = sext <4 x i8> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i32> %a2sa, %a2sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3zb = zext <8 x i8> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3zm = mul <8 x i32> %a3za, %a3zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sb = sext <8 x i8> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3sm = mul <8 x i32> %a3sa, %a3sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4zb = zext <16 x i8> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4zm = mul <16 x i32> %a4za, %a4zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sb = sext <16 x i8> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4sm = mul <16 x i32> %a4sa, %a4sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zb = zext <1 x i16> undef to <1 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zm = mul <1 x i32> %a5za, %a5zb
@@ -604,37 +604,37 @@ define void @mla_i32() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zb = zext <4 x i16> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zm = mul <4 x i32> %a7za, %a7zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sb = sext <4 x i16> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sm = mul <4 x i32> %a7sa, %a7sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8zb = zext <8 x i16> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8zm = mul <8 x i32> %a8za, %a8zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sb = sext <8 x i16> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8sm = mul <8 x i32> %a8sa, %a8sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9zb = zext <16 x i16> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9zm = mul <16 x i32> %a9za, %a9zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sb = sext <16 x i16> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9sm = mul <16 x i32> %a9sa, %a9sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a10m = mul <1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a11m = mul <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12m = mul <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13m = mul <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14m = mul <16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i32>