[llvm] 233fb98 - [ARM] Improve bitwise reduction costs

Mon Sep 4 08:22:57 PDT 2023

Author: David Green
Date: 2023-09-04T16:22:52+01:00
New Revision: 233fb987fcf6a5dc2d12b07cb8e30fffd5471871

URL: https://github.com/llvm/llvm-project/commit/233fb987fcf6a5dc2d12b07cb8e30fffd5471871
DIFF: https://github.com/llvm/llvm-project/commit/233fb987fcf6a5dc2d12b07cb8e30fffd5471871.diff

LOG: [ARM] Improve bitwise reduction costs

This adds some basic and/or/xor reduction costs for NEON/MVE. They are handled
like other reductions: vector operations are used to reduce down to a legal
size, followed by an optional VREV+VAND/VORR/VEOR step, and the remaining
elements are scalarized from there.

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/ARM/reduce-bit.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 4df79da6e7e08a..e0d112c4a7eddb 100644

--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1707,6 +1707,34 @@ ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
   }
 
+  if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
+      (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
+    unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
+    unsigned VecLimit =
+        ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
+    InstructionCost VecCost = 0;
+    while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
+      Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
+      VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
+      NumElts /= 2;
+    }
+    // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
+    // step.
+    if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
+        NumElts * EltSize == 64) {
+      Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
+      VecCost += ST->getMVEVectorCostFactor(CostKind) +
+                 getArithmeticInstrCost(Opcode, VecTy, CostKind);
+      NumElts /= 2;
+    }
+
+    // From here we extract the elements and perform the and/or/xor.
+    InstructionCost ExtractCost = NumElts;
+    return VecCost + ExtractCost +
+           (NumElts - 1) * getArithmeticInstrCost(
+                               Opcode, ValTy->getElementType(), CostKind);
+  }
+
   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
       TTI::requiresOrderedReduction(FMF))
     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

diff  --git a/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll b/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll
index e0ea2ab10d2390..b38660df59a3e7 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll
@@ -4,39 +4,39 @@
 
 define void @and() {
 ; CHECK-V8-LABEL: 'and'
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
 ; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVEI-LABEL: 'and'
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 394 for instruction: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 524 for instruction: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 394 for instruction: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1036 for instruction: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1294 for instruction: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
 ; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
@@ -60,39 +60,39 @@ entry:
 
 define void @or() {
 ; CHECK-V8-LABEL: 'or'
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
 ; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVEI-LABEL: 'or'
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 394 for instruction: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 524 for instruction: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 394 for instruction: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1036 for instruction: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1294 for instruction: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
 ; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
@@ -116,39 +116,39 @@ entry:
 
 define void @xor() {
 ; CHECK-V8-LABEL: 'xor'
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
-; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
 ; CHECK-V8-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVEI-LABEL: 'xor'
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 394 for instruction: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 524 for instruction: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 394 for instruction: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1036 for instruction: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
-; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1294 for instruction: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
 ; CHECK-MVEI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry: