[llvm] [RISCV][TTI] Implement instruction cost for vp.reduce.* (PR #114184)

Tue Oct 29 22:19:31 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-risc-v

Author: Elvis Wang (ElvisWang123)

<details>
<summary>Changes</summary>

This patch based on #114180. 

---

Patch is 624.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114184.diff


14 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+74-1) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-add.ll (+131-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-and.ll (+140-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll (+211-3) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-fmaximum.ll (+178-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-fminimum.ll (+94-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll (+211-3) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-max.ll (+257-5) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-min.ll (+256-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-or.ll (+140-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll (+207-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll (+348-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/reduce-xor.ll (+131-4) 
- (modified) llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll (+32-32) 


``````````diff

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 395baa5f1aab99..627eb9cc4bc5fe 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1179,6 +1179,79 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
                               ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
                               CostKind);
+  case Intrinsic::vp_reduce_add:
+  case Intrinsic::vp_reduce_fadd:
+  case Intrinsic::vp_reduce_mul:
+  case Intrinsic::vp_reduce_fmul:
+  case Intrinsic::vp_reduce_and:
+  case Intrinsic::vp_reduce_or:
+  case Intrinsic::vp_reduce_xor: {
+    unsigned Opcode;
+    switch (ICA.getID()) {
+    case Intrinsic::vp_reduce_add:
+      Opcode = Instruction::Add;
+      break;
+    case Intrinsic::vp_reduce_fadd:
+      Opcode = Instruction::FAdd;
+      break;
+    case Intrinsic::vp_reduce_mul:
+      Opcode = Instruction::Mul;
+      break;
+    case Intrinsic::vp_reduce_fmul:
+      Opcode = Instruction::FMul;
+      break;
+    case Intrinsic::vp_reduce_and:
+      Opcode = Instruction::And;
+      break;
+    case Intrinsic::vp_reduce_or:
+      Opcode = Instruction::Or;
+      break;
+    case Intrinsic::vp_reduce_xor:
+      Opcode = Instruction::Xor;
+      break;
+    }
+    return getArithmeticReductionCost(Opcode,
+                                      cast<VectorType>(ICA.getArgTypes()[1]),
+                                      ICA.getFlags(), CostKind);
+  }
+  case Intrinsic::vp_reduce_smax:
+  case Intrinsic::vp_reduce_smin:
+  case Intrinsic::vp_reduce_umax:
+  case Intrinsic::vp_reduce_umin:
+  case Intrinsic::vp_reduce_fmax:
+  case Intrinsic::vp_reduce_fmaximum:
+  case Intrinsic::vp_reduce_fmin:
+  case Intrinsic::vp_reduce_fminimum: {
+    unsigned IID;
+    switch (ICA.getID()) {
+    case Intrinsic::vp_reduce_smax:
+      IID = Intrinsic::smax;
+      break;
+    case Intrinsic::vp_reduce_smin:
+      IID = Intrinsic::smin;
+      break;
+    case Intrinsic::vp_reduce_umax:
+      IID = Intrinsic::umax;
+      break;
+    case Intrinsic::vp_reduce_umin:
+      IID = Intrinsic::umin;
+      break;
+    case Intrinsic::vp_reduce_fmax:
+      IID = Intrinsic::maxnum;
+      break;
+    case Intrinsic::vp_reduce_fmaximum:
+      IID = Intrinsic::maximum;
+      break;
+    case Intrinsic::vp_reduce_fmin:
+      IID = Intrinsic::minnum;
+      break;
+    case Intrinsic::vp_reduce_fminimum:
+      IID = Intrinsic::minimum;
+      break;
+    }
+    return getMinMaxReductionCost(IID, cast<VectorType>(ICA.getArgTypes()[1]),
+                                  ICA.getFlags(), CostKind);
+  }
   }
 
   if (ST->hasVInstructions() && RetTy->isVectorTy()) {
@@ -1552,7 +1625,7 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
   }
 
   // IR Reduction is composed by two vmv and one rvv reduction instruction.
-  if (TTI::requiresOrderedReduction(FMF)) {
+  if (TTI::requiresOrderedReduction(FMF) && ElementTy->isFloatingPointTy()) {
     Opcodes.push_back(RISCV::VFMV_S_F);
     for (unsigned i = 0; i < LT.first.getValue(); i++)
       Opcodes.push_back(RISCV::VFREDOSUM_VS);
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
index 6032ae01aa718b..70687da17eb1a5 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv32 -mattr=+v -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
-; RUN: opt < %s -mtriple=riscv32 -mattr=+v -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -mtriple=riscv32 -mattr=+v -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output --type-based-intrinsic-cost=true | FileCheck %s
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output --type-based-intrinsic-cost=true | FileCheck %s
+; RUN: opt < %s -mtriple=riscv32 -mattr=+v -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output --type-based-intrinsic-cost=true \
+; RUN: | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output --type-based-intrinsic-cost=true \
+; RUN: | FileCheck %s --check-prefix=SIZE
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
@@ -14,6 +16,14 @@ define i32 @reduce_i1(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1_vp = call i1 @llvm.vp.reduce.add.v1i1(i1 undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_vp = call i1 @llvm.vp.reduce.add.v2i1(i1 undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4_vp = call i1 @llvm.vp.reduce.add.v4i1(i1 undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8_vp = call i1 @llvm.vp.reduce.add.v8i1(i1 undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16_vp = call i1 @llvm.vp.reduce.add.v16i1(i1 undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32_vp = call i1 @llvm.vp.reduce.add.v32i1(i1 undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64_vp = call i1 @llvm.vp.reduce.add.v64i1(i1 undef, <64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_vp = call i1 @llvm.vp.reduce.add.v128i1(i1 undef, <128 x i1> undef, <128 x i1> undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i1'
@@ -25,6 +35,14 @@ define i32 @reduce_i1(i32 %arg) {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1_vp = call i1 @llvm.vp.reduce.add.v1i1(i1 undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_vp = call i1 @llvm.vp.reduce.add.v2i1(i1 undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4_vp = call i1 @llvm.vp.reduce.add.v4i1(i1 undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8_vp = call i1 @llvm.vp.reduce.add.v8i1(i1 undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16_vp = call i1 @llvm.vp.reduce.add.v16i1(i1 undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32_vp = call i1 @llvm.vp.reduce.add.v32i1(i1 undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64_vp = call i1 @llvm.vp.reduce.add.v64i1(i1 undef, <64 x i1> undef, <64 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_vp = call i1 @llvm.vp.reduce.add.v128i1(i1 undef, <128 x i1> undef, <128 x i1> undef, i32 undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
@@ -35,6 +53,15 @@ define i32 @reduce_i1(i32 %arg) {
   %V32  = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
   %V64  = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
   %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+
+  %V1_vp   = call i1 @llvm.vp.reduce.add.v1i1(i1 undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+  %V2_vp   = call i1 @llvm.vp.reduce.add.v2i1(i1 undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+  %V4_vp   = call i1 @llvm.vp.reduce.add.v4i1(i1 undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+  %V8_vp   = call i1 @llvm.vp.reduce.add.v8i1(i1 undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+  %V16_vp  = call i1 @llvm.vp.reduce.add.v16i1(i1 undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+  %V32_vp  = call i1 @llvm.vp.reduce.add.v32i1(i1 undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+  %V64_vp  = call i1 @llvm.vp.reduce.add.v64i1(i1 undef, <64 x i1> undef, <64 x i1> undef, i32 undef)
+  %V128_vp = call i1 @llvm.vp.reduce.add.v128i1(i1 undef, <128 x i1> undef, <128 x i1> undef, i32 undef)
   ret i32 undef
 }
 
@@ -48,6 +75,14 @@ define i32 @reduce_i8(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1_vp = call i8 @llvm.vp.reduce.add.v1i8(i8 undef, <1 x i8> undef, <1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_vp = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_vp = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_vp = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16_vp = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32_vp = call i8 @llvm.vp.reduce.add.v32i8(i8 undef, <32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64_vp = call i8 @llvm.vp.reduce.add.v64i8(i8 undef, <64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128_vp = call i8 @llvm.vp.reduce.add.v128i8(i8 undef, <128 x i8> undef, <128 x i1> undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i8'
@@ -59,6 +94,14 @@ define i32 @reduce_i8(i32 %arg) {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1_vp = call i8 @llvm.vp.reduce.add.v1i8(i8 undef, <1 x i8> undef, <1 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_vp = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_vp = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_vp = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16_vp = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_vp = call i8 @llvm.vp.reduce.add.v32i8(i8 undef, <32 x i8> undef, <32 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_vp = call i8 @llvm.vp.reduce.add.v64i8(i8 undef, <64 x i8> undef, <64 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_vp = call i8 @llvm.vp.reduce.add.v128i8(i8 undef, <128 x i8> undef, <128 x i1> undef, i32 undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1   = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
@@ -69,6 +112,15 @@ define i32 @reduce_i8(i32 %arg) {
   %V32  = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
   %V64  = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
   %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+
+  %V1_vp   = call i8 @llvm.vp.reduce.add.v1i8(i8 undef, <1 x i8> undef, <1 x i1> undef, i32 undef)
+  %V2_vp   = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+  %V4_vp   = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+  %V8_vp   = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+  %V16_vp  = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+  %V32_vp  = call i8 @llvm.vp.reduce.add.v32i8(i8 undef, <32 x i8> undef, <32 x i1> undef, i32 undef)
+  %V64_vp  = call i8 @llvm.vp.reduce.add.v64i8(i8 undef, <64 x i8> undef, <64 x i1> undef, i32 undef)
+  %V128_vp = call i8 @llvm.vp.reduce.add.v128i8(i8 undef, <128 x i8> undef, <128 x i1> undef, i32 undef)
   ret i32 undef
 }
 
@@ -82,6 +134,14 @@ define i32 @reduce_i16(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1_vp = call i16 @llvm.vp.reduce.add.v1i16(i16 undef, <1 x i16> undef, <1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_vp = call i16 @llvm.vp.reduce.add.v2i16(i16 undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_vp = call i16 @llvm.vp.reduce.add.v4i16(i16 undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_vp = call i16 @llvm.vp.reduce.add.v8i16(i16 undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16_vp = call i16 @llvm.vp.reduce.add.v16i16(i16 undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32_vp = call i16 @llvm.vp.reduce.add.v32i16(i16 undef, <32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64_vp = call i16 @llvm.vp.reduce.add.v64i16(i16 undef, <64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128_vp = call i16 @llvm.vp.reduce.add.v128i16(i16 undef, <128 x i16> undef, <128 x i1> undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i16'
@@ -93,6 +153,14 @@ define i32 @reduce_i16(i32 %arg) {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1_vp = call i16 @llvm.vp.reduce.add.v1i16(i16 undef, <1 x i16> undef, <1 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_vp = call i16 @llvm.vp.reduce.add.v2i16(i16 undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_vp = call i16 @llvm.vp.reduce.add.v4i16(i16 undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_vp = call i16 @llvm.vp.reduce.add.v8i16(i16 undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16_vp = call i16 @llvm.vp.reduce.add.v16i16(i16 undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_vp = call i16 @llvm.vp.reduce.add.v32i16(i16 undef, <32 x i16> undef, <32 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_vp = call i16 @llvm.vp.reduce.add.v64i16(i16 undef, <64 x i16> undef, <64 x i1> undef, i32 undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128_vp = call i16 @llvm.vp.reduce.add.v128i16(i16 undef, <128 x i16> undef, <128 x i1> undef, i32 undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1   = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
@@ -103,6 +171,15 @@ define i32 @reduce_i16(i32 %arg) {
   %V32  = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
   %V64  = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
   %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+
+  %V1_vp   = call i16 @llvm.vp.reduce.add.v1i16(i16 undef, <1 x i16> undef, <1 x i1> undef, i32 undef)
+  %V2_vp   = call i16 @llvm.vp.reduce.add.v2i16(i16 undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+  %V4_vp   = call i16 @llvm.vp.reduce.add.v4i16(i16 undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+  %V8_vp   = call i16 @llvm.vp.reduce.add.v8i16(i16 undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+  %V16_vp  = call i16 @llvm.vp.reduce.add.v16i16(i16 undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+  %V32_vp  = call i16 @llvm.vp.reduce.add.v32i16(i16 undef, <32 x i16> undef, <32 x i1> undef, i32 undef)
+  %V64_vp  = call i16 @llvm.vp.reduce.add.v64i16(i16 undef, <64 x i16> undef, <64 x i1> undef, i32 undef)
+  %V128_vp = call i16 @llvm.vp.reduce.add.v128i16(i16 undef, <128 x i16> undef, <128 x i1> undef, i32 undef)
   ret i32 undef
 }
 
@@ -116,6 +193,14 @@ define i32 @reduce_i32(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost o...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/114184