[llvm] [RISCV][CostModel] Recommit VPIntrinsics have same cost as their non-vp counterparts (PR #68752)
Michael Maitland via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 10 16:10:32 PDT 2023
https://github.com/michaelmaitland created https://github.com/llvm/llvm-project/pull/68752
This was reverted in commit 0abaf3caee88ae74def2c7000aff8e61b24634bb (#67178).
This version of the patch includes a fix for a bug caused by
vp-reductions having an extra start value argument that their
non-vp counterparts do not have.
There was a bug that occurred because VP reductions have a start
value argument that their non-vp counterparts do not have. Once
that bug was fixed, I discovered that non-vp reductions are costed
using Intrinsic::not_intrinsic. That should be fixed by a future
patch, but for now the VP version of the intrinsic is costed in the
same way.
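For context, the operand layouts involved look like this (illustrative
declarations of existing LLVM intrinsics, not part of this patch): the
vp form carries a trailing mask and explicit vector length (EVL)
operand, and a vp reduction additionally carries a start value that the
integer non-vp reduction lacks.
  declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
  declare i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %mask, i32 %evl)
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %val)
Dropping the trailing mask and EVL operands therefore recovers the
non-vp signature for most intrinsics, while the reductions also need
the start value handled.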
From bc5bdeac46b8295f3a43a033dafa6061c6e82814 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 10 Oct 2023 10:35:01 -0700
Subject: [PATCH 1/2] [RISCV][CostModel] Recommit VPIntrinsics have same cost
as their non-vp counterparts
This was reverted in commit 0abaf3caee88ae74def2c7000aff8e61b24634bb (#67178).
This version of the patch includes a fix for a bug caused by
vp-reductions having an extra start value argument that their
non-vp counterparts do not have.
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 56 +++
llvm/test/Analysis/CostModel/RISCV/gep.ll | 8 +-
.../CostModel/RISCV/rvv-intrinsics.ll | 370 +++++++++++++++++-
3 files changed, 429 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 3dd16dafe3c42a7..2d0d10982aff6f5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1691,6 +1691,62 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
}
+ // VP Intrinsics should have the same cost as their non-vp counterpart.
+ // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
+ // counterpart when the vector length argument is smaller than the maximum
+ // vector length.
+ if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
+ std::optional<unsigned> FOp =
+ VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
+ if (FOp) {
+ // TODO: Support other kinds of Intrinsics (i.e. reductions)
+ if (ICA.getID() == Intrinsic::vp_load) {
+ Align Alignment;
+ if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+ Alignment = VPI->getPointerAlignment().valueOrOne();
+ unsigned AS = 0;
+ if (ICA.getArgs().size() > 1)
+ if (auto *PtrTy =
+ dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
+ AS = PtrTy->getAddressSpace();
+ return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
+ AS, CostKind);
+ }
+ if (ICA.getID() == Intrinsic::vp_store) {
+ Align Alignment;
+ if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+ Alignment = VPI->getPointerAlignment().valueOrOne();
+ unsigned AS = 0;
+ if (ICA.getArgs().size() >= 2)
+ if (auto *PtrTy =
+ dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
+ AS = PtrTy->getAddressSpace();
+ return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
+ AS, CostKind);
+ }
+ if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
+ return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
+ CostKind);
+ }
+ }
+
+ std::optional<Intrinsic::ID> FID =
+ VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
+ if (FID) {
+ // Non-vp version will have same Args/Tys except mask and vector length.
+ assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
+ "Expected VPIntrinsic to have Mask and Vector Length args and "
+ "types");
+ ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs()).drop_back(2);
+ ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
+
+ IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
+ NewTys, ICA.getFlags(), ICA.getInst(),
+ ICA.getScalarizationCost());
+ return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
+ }
+ }
+
// Assume that we need to scalarize this intrinsic.
// Compute the scalarization overhead based on Args for a vector
// intrinsic.
diff --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll
index be518faf7e05165..4fadf34c1973f83 100644
--- a/llvm/test/Analysis/CostModel/RISCV/gep.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll
@@ -270,7 +270,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42
; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42
; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = getelementptr i8, ptr %base, i32 42
@@ -282,7 +282,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42
; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42
; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -340,7 +340,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0
; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0
; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %7 = getelementptr i8, ptr %base, i32 0
@@ -352,7 +352,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0
; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0
; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 93de623cf1c6da9..85364c935267d24 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -206,10 +206,378 @@ define void @vp_fshl() {
ret void
}
+define void @add() {
+; CHECK-LABEL: 'add'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = add <2 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = add <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = add <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = add <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = add <16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = add <vscale x 2 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = add <vscale x 4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = add <vscale x 8 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = add <vscale x 16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = add <vscale x 2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = add <vscale x 4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = add <vscale x 8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = add <vscale x 16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+ %t1 = add <2 x i8> undef, undef
+ %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+ %t3 = add <4 x i8> undef, undef
+ %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+ %t5 = add <8 x i8> undef, undef
+ %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+ %t7 = add <16 x i8> undef, undef
+ %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+ %t9 = add <2 x i64> undef, undef
+ %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+ %t12 = add <4 x i64> undef, undef
+ %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+ %t14 = add <8 x i64> undef, undef
+ %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+ %t16 = add <16 x i64> undef, undef
+ %t17 = call <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t18 = add <vscale x 2 x i8> undef, undef
+ %t19 = call <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t20 = add <vscale x 4 x i8> undef, undef
+ %t21 = call <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t22 = add <vscale x 8 x i8> undef, undef
+ %t23 = call <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t24 = add <vscale x 16 x i8> undef, undef
+ %t25 = call <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t26 = add <vscale x 2 x i64> undef, undef
+ %t27 = call <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t28 = add <vscale x 4 x i64> undef, undef
+ %t29 = call <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t30 = add <vscale x 8 x i64> undef, undef
+ %t31 = call <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t32 = add <vscale x 16 x i64> undef, undef
+ ret void
+}
+
+define void @abs() {
+; CHECK-LABEL: 'abs'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef)
+ call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 0)
+ call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 0, <4 x i1> undef, i32 undef)
+ call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 0)
+ call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 0, <8 x i1> undef, i32 undef)
+ call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 0)
+ call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 0, <16 x i1> undef, i32 undef)
+ call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 0)
+ call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 0, <2 x i1> undef, i32 undef)
+ call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 0)
+ call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 0, <4 x i1> undef, i32 undef)
+ call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 0)
+ call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 0, <8 x i1> undef, i32 undef)
+ call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 0)
+ call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 0, <16 x i1> undef, i32 undef)
+ call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 0)
+ call <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+ call <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8> undef, i1 0)
+ call <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+ call <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8> undef, i1 0)
+ call <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+ call <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8> undef, i1 0)
+ call <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+ call <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8> undef, i1 0)
+ call <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+ call <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64> undef, i1 0)
+ call <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+ call <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64> undef, i1 0)
+ call <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+ call <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64> undef, i1 0)
+ call <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+ call <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64> undef, i1 0)
+ ret void
+}
+
+define void @load() {
+; CHECK-LABEL: 'load'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef)
+ %t1 = load <2 x i8>, ptr undef
+ %t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr undef, <4 x i1> undef, i32 undef)
+ %t3 = load <4 x i8>, ptr undef
+ %t4 = call <8 x i8> @llvm.vp.load.v8i8(ptr undef, <8 x i1> undef, i32 undef)
+ %t5 = load <8 x i8>, ptr undef
+ %t6 = call <16 x i8> @llvm.vp.load.v16i8(ptr undef, <16 x i1> undef, i32 undef)
+ %t7 = load <16 x i8>, ptr undef
+ %t8 = call <2 x i64> @llvm.vp.load.v2i64(ptr undef, <2 x i1> undef, i32 undef)
+ %t9 = load <2 x i64>, ptr undef
+ %t10 = call <4 x i64> @llvm.vp.load.v4i64(ptr undef, <4 x i1> undef, i32 undef)
+ %t12 = load <4 x i64>, ptr undef
+ %t13 = call <8 x i64> @llvm.vp.load.v8i64(ptr undef, <8 x i1> undef, i32 undef)
+ %t14 = load <8 x i64>, ptr undef
+ %t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef)
+ %t16 = load <16 x i64>, ptr undef
+ %t17 = call <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ %t18 = load <vscale x 2 x i8>, ptr undef
+ %t19 = call <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ %t20 = load <vscale x 4 x i8>, ptr undef
+ %t21 = call <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ %t22 = load <vscale x 8 x i8>, ptr undef
+ %t23 = call <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ %t24 = load <vscale x 16 x i8>, ptr undef
+ %t25 = call <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ %t26 = load <vscale x 2 x i64>, ptr undef
+ %t27 = call <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ %t28 = load <vscale x 4 x i64>, ptr undef
+ %t29 = call <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ %t30 = load <vscale x 8 x i64>, ptr undef
+ %t31 = call <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ %t32 = load <vscale x 16 x i64>, ptr undef
+ ret void
+}
+
+define void @store() {
+; CHECK-LABEL: 'store'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 2 x i64> undef, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 4 x i64> undef, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 8 x i64> undef, ptr undef, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.vp.store.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+ store <2 x i8> undef, ptr undef
+ call void @llvm.vp.store.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+ store <4 x i8> undef, ptr undef
+ call void @llvm.vp.store.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+ store <8 x i8> undef, ptr undef
+ call void @llvm.vp.store.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+ store <16 x i8> undef, ptr undef
+ call void @llvm.vp.store.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+ store <2 x i64> undef, ptr undef
+ call void @llvm.vp.store.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+ store <4 x i64> undef, ptr undef
+ call void @llvm.vp.store.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+ store <8 x i64> undef, ptr undef
+ call void @llvm.vp.store.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+ store <16 x i64> undef, ptr undef
+ call void @llvm.vp.store.nv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ store <vscale x 2 x i8> undef, ptr undef
+ call void @llvm.vp.store.nv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ store <vscale x 4 x i8> undef, ptr undef
+ call void @llvm.vp.store.nv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ store <vscale x 8 x i8> undef, ptr undef
+ call void @llvm.vp.store.nv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ store <vscale x 16 x i8> undef, ptr undef
+ call void @llvm.vp.store.nv2i64(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ store <vscale x 2 x i64> undef, ptr undef
+ call void @llvm.vp.store.nv4i64(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ store <vscale x 4 x i64> undef, ptr undef
+ call void @llvm.vp.store.nv8i64(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ store <vscale x 8 x i64> undef, ptr undef
+ call void @llvm.vp.store.nv16i64(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ store <vscale x 16 x i64> undef, ptr undef
+ ret void
+}
+
+declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.add.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.add.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.add.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.add.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.add.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+
+declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.abs.v4i8(<4 x i8>, i1, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.abs.v8i8(<8 x i8>, i1, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.abs.v16i8(<16 x i8>, i1, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.abs.v2i64(<2 x i64>, i1, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.abs.v4i64(<4 x i64>, i1, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.abs.v8i64(<8 x i64>, i1, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.abs.v16i64(<16 x i64>, i1, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
+
+declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
+declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
+declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
+declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
+declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
+declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
+declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
+declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
+declare <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8>, i1)
+declare <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8>, i1)
+declare <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8>, i1)
+declare <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8>, i1)
+declare <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64>, i1)
+declare <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64>, i1)
+declare <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64>, i1)
+declare <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64>, i1)
+
+declare <2 x i8> @llvm.vp.load.v2i8(ptr, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.load.v4i8(ptr, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.load.v8i8(ptr, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.load.v16i8(ptr, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.load.v2i64(ptr, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.load.v4i64(ptr, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.load.v8i64(ptr, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.load.v16i64(ptr, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr, <vscale x 16 x i1>, i32)
+
+declare void @llvm.vp.store.v2i8(<2 x i8>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i8(<4 x i8>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i8(<8 x i8>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i8(<16 x i8>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.v2i64(<2 x i64>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i64(<4 x i64>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i64(<8 x i64>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i64(<16 x i64>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.nv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+
declare <vscale x 1 x i32> @llvm.fshr.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
declare <vscale x 1 x i32> @llvm.fshl.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
-
declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float>, i32)
declare <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float>)
From 18fa7ac3359093b1737541532a5db0b67e9fa708 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 10 Oct 2023 12:49:07 -0700
Subject: [PATCH 2/2] [CostModel] Cost VPReductions as non-vp counterpart
There was a bug that occurred because VP reductions have a start
value argument that their non-vp counterparts do not have. Once
that bug was fixed, I discovered that non-vp reductions are costed
using Intrinsic::not_intrinsic. That should be fixed by a future
patch, but for now the VP version of the intrinsic is costed in the
same way.
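For comparison (illustrative declarations of existing LLVM intrinsics,
not part of this patch): only the fadd/fmul non-vp reductions keep a
start value, so their vp forms line up once the mask and EVL are
dropped, while the other vp reductions also need the start-value type
dropped, which is what the change below does.
  declare float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> %mask, i32 %evl)
  declare float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %val)
  declare i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %mask, i32 %evl)
  declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %val)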
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 27 +-
.../CostModel/RISCV/rvv-intrinsics.ll | 370 ++++++++++++++----
2 files changed, 311 insertions(+), 86 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 2d0d10982aff6f5..f87bfa1f5ee28c7 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1695,11 +1695,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
// counterpart when the vector length argument is smaller than the maximum
// vector length.
+ // TODO: Support other kinds of VPIntrinsics
if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
std::optional<unsigned> FOp =
VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
if (FOp) {
- // TODO: Support other kinds of Intrinsics (i.e. reductions)
if (ICA.getID() == Intrinsic::vp_load) {
Align Alignment;
if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
@@ -1737,17 +1737,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
"Expected VPIntrinsic to have Mask and Vector Length args and "
"types");
- ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs()).drop_back(2);
ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
- IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
- NewTys, ICA.getFlags(), ICA.getInst(),
- ICA.getScalarizationCost());
+ // FIXME: it looks like non-vp reductions are costed using the
+ // Intrinsic::not_intrinsic opcode in the cost model. In the future,
+ // they should use the correct intrinsic opcode. The approach for
+ // costing VPIntrinsics is to cost them as their non-vp counterpart so
+ // we use Intrinsic::not_intrinsic below, however this must change when
+ // non-vp reductions use the correct ID.
+ if (VPReductionIntrinsic::isVPReduction(ICA.getID()))
+ FID = std::make_optional<Intrinsic::ID>(Intrinsic::not_intrinsic);
+
+ // VPReduction intrinsics have a start value argument that their non-vp
+ // counterparts do not have, except for the fadd and fmul non-vp
+ // counterpart.
+ if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
+ *FID != Intrinsic::vector_reduce_fadd &&
+ *FID != Intrinsic::vector_reduce_fmul)
+ NewTys = NewTys.drop_front();
+
+ IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
+ ICA.getFlags());
return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
}
}
// Assume that we need to scalarize this intrinsic.
// Compute the scalarization overhead based on Args for a vector
// intrinsic.
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 85364c935267d24..5707db18cfe92da 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -258,21 +258,21 @@ define void @add() {
%t14 = add <8 x i64> undef, undef
%t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
%t16 = add <16 x i64> undef, undef
- %t17 = call <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t17 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
%t18 = add <vscale x 2 x i8> undef, undef
- %t19 = call <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t19 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
%t20 = add <vscale x 4 x i8> undef, undef
- %t21 = call <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t21 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
%t22 = add <vscale x 8 x i8> undef, undef
- %t23 = call <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t23 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
%t24 = add <vscale x 16 x i8> undef, undef
- %t25 = call <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t25 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
%t26 = add <vscale x 2 x i64> undef, undef
- %t27 = call <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t27 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
%t28 = add <vscale x 4 x i64> undef, undef
- %t29 = call <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t29 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
%t30 = add <vscale x 8 x i64> undef, undef
- %t31 = call <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t31 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
%t32 = add <vscale x 16 x i64> undef, undef
ret void
}
@@ -329,22 +329,22 @@ define void @abs() {
call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 0)
call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 0, <16 x i1> undef, i32 undef)
call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 0)
- call <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
- call <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8> undef, i1 0)
- call <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
- call <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8> undef, i1 0)
- call <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
- call <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8> undef, i1 0)
- call <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
- call <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8> undef, i1 0)
- call <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
- call <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64> undef, i1 0)
- call <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
- call <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64> undef, i1 0)
- call <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
- call <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64> undef, i1 0)
- call <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
- call <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64> undef, i1 0)
+ call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+ call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 0)
+ call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+ call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 0)
+ call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+ call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 0)
+ call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+ call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 0)
+ call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+ call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 0)
+ call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+ call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 0)
+ call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+ call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 0)
+ call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+ call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 0)
ret void
}
@@ -400,21 +400,21 @@ define void @load() {
%t14 = load <8 x i64>, ptr undef
%t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef)
%t16 = load <16 x i64>, ptr undef
- %t17 = call <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8(ptr undef, <vscale x 2 x i1> undef, i32 undef)
%t18 = load <vscale x 2 x i8>, ptr undef
- %t19 = call <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8(ptr undef, <vscale x 4 x i1> undef, i32 undef)
%t20 = load <vscale x 4 x i8>, ptr undef
- %t21 = call <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8(ptr undef, <vscale x 8 x i1> undef, i32 undef)
%t22 = load <vscale x 8 x i8>, ptr undef
- %t23 = call <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8(ptr undef, <vscale x 16 x i1> undef, i32 undef)
%t24 = load <vscale x 16 x i8>, ptr undef
- %t25 = call <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64(ptr undef, <vscale x 2 x i1> undef, i32 undef)
%t26 = load <vscale x 2 x i64>, ptr undef
- %t27 = call <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64(ptr undef, <vscale x 4 x i1> undef, i32 undef)
%t28 = load <vscale x 4 x i64>, ptr undef
- %t29 = call <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64(ptr undef, <vscale x 8 x i1> undef, i32 undef)
%t30 = load <vscale x 8 x i64>, ptr undef
- %t31 = call <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64(ptr undef, <vscale x 16 x i1> undef, i32 undef)
%t32 = load <vscale x 16 x i64>, ptr undef
ret void
}
@@ -471,25 +471,167 @@ define void @store() {
store <8 x i64> undef, ptr undef
call void @llvm.vp.store.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
store <16 x i64> undef, ptr undef
- call void @llvm.vp.store.nv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
store <vscale x 2 x i8> undef, ptr undef
- call void @llvm.vp.store.nv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
store <vscale x 4 x i8> undef, ptr undef
- call void @llvm.vp.store.nv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
store <vscale x 8 x i8> undef, ptr undef
- call void @llvm.vp.store.nv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
store <vscale x 16 x i8> undef, ptr undef
- call void @llvm.vp.store.nv2i64(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv2i64(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
store <vscale x 2 x i64> undef, ptr undef
- call void @llvm.vp.store.nv4i64(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv4i64(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
store <vscale x 4 x i64> undef, ptr undef
- call void @llvm.vp.store.nv8i64(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv8i64(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
store <vscale x 8 x i64> undef, ptr undef
- call void @llvm.vp.store.nv16i64(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+ call void @llvm.vp.store.nxv16i64(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
store <vscale x 16 x i64> undef, ptr undef
ret void
}
+define void @reduce_add() {
+; CHECK-LABEL: 'reduce_add'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %1 = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call i8 @llvm.reduce.add.v2i8(<2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %3 = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call i8 @llvm.reduce.add.v4i8(<4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %5 = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call i8 @llvm.reduce.add.v8i8(<8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %7 = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call i8 @llvm.reduce.add.v16i8(<16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %9 = call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call i64 @llvm.reduce.add.v2i64(<2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %11 = call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call i64 @llvm.reduce.add.v4i64(<4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call i64 @llvm.reduce.add.v8i64(<8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %15 = call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call i64 @llvm.reduce.add.v16i64(<16 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call i8 @llvm.vp.reduce.add.nxv2i8(i8 undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %21 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %23 = call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %25 = call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %30 = call i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.v2i8(<2 x i8> undef)
+ call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.v4i8(<4 x i8> undef)
+ call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.v8i8(<8 x i8> undef)
+ call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.v16i8(<16 x i8> undef)
+ call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.v2i64(<2 x i64> undef)
+ call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.v4i64(<4 x i64> undef)
+ call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.v8i64(<8 x i64> undef)
+ call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.v16i64(<16 x i64> undef)
+ call i8 @llvm.vp.reduce.add.nxv2i8(i8 undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+ call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+ call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+ call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ call i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+ call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+ call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+ call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+ call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ call i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+ ret void
+}
+
+define void @reduce_fadd() {
+; CHECK-LABEL: 'reduce_fadd'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %30 = call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
+ call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
+ call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
+ call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
+ call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
+ call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
+ call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
+ call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
+ call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+ call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+ call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+ call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+ call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+ call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+ call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+ call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+ ret void
+}
+
declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
@@ -498,14 +640,14 @@ declare <2 x i64> @llvm.vp.add.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
declare <4 x i64> @llvm.vp.add.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
declare <8 x i64> @llvm.vp.add.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
declare <16 x i64> @llvm.vp.add.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1, <2 x i1>, i32)
declare <4 x i8> @llvm.vp.abs.v4i8(<4 x i8>, i1, <4 x i1>, i32)
@@ -515,14 +657,14 @@ declare <2 x i64> @llvm.vp.abs.v2i64(<2 x i64>, i1, <2 x i1>, i32)
declare <4 x i64> @llvm.vp.abs.v4i64(<4 x i64>, i1, <4 x i1>, i32)
declare <8 x i64> @llvm.vp.abs.v8i64(<8 x i64>, i1, <8 x i1>, i32)
declare <16 x i64> @llvm.vp.abs.v16i64(<16 x i64>, i1, <16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
@@ -532,14 +674,14 @@ declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
-declare <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8>, i1)
-declare <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8>, i1)
-declare <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8>, i1)
-declare <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8>, i1)
-declare <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64>, i1)
-declare <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64>, i1)
-declare <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64>, i1)
-declare <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64>, i1)
+declare <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8>, i1)
+declare <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8>, i1)
+declare <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8>, i1)
+declare <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8>, i1)
+declare <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64>, i1)
+declare <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64>, i1)
+declare <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64>, i1)
+declare <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64>, i1)
declare <2 x i8> @llvm.vp.load.v2i8(ptr, <2 x i1>, i32)
declare <4 x i8> @llvm.vp.load.v4i8(ptr, <4 x i1>, i32)
@@ -549,14 +691,14 @@ declare <2 x i64> @llvm.vp.load.v2i64(ptr, <2 x i1>, i32)
declare <4 x i64> @llvm.vp.load.v4i64(ptr, <4 x i1>, i32)
declare <8 x i64> @llvm.vp.load.v8i64(ptr, <8 x i1>, i32)
declare <16 x i64> @llvm.vp.load.v16i64(ptr, <16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.load.nxv2i8(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.load.nxv4i8(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.load.nxv8i8(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.load.nxv16i8(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.load.nxv2i64(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.load.nxv4i64(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.load.nxv8i64(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.load.nxv16i64(ptr, <vscale x 16 x i1>, i32)
declare void @llvm.vp.store.v2i8(<2 x i8>, ptr, <2 x i1>, i32)
declare void @llvm.vp.store.v4i8(<4 x i8>, ptr, <4 x i1>, i32)
@@ -566,14 +708,82 @@ declare void @llvm.vp.store.v2i64(<2 x i64>, ptr, <2 x i1>, i32)
declare void @llvm.vp.store.v4i64(<4 x i64>, ptr, <4 x i1>, i32)
declare void @llvm.vp.store.v8i64(<8 x i64>, ptr, <8 x i1>, i32)
declare void @llvm.vp.store.v16i64(<16 x i64>, ptr, <16 x i1>, i32)
-declare void @llvm.vp.store.nv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
-declare void @llvm.vp.store.nv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
-declare void @llvm.vp.store.nv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
-declare void @llvm.vp.store.nv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
-declare void @llvm.vp.store.nv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
-declare void @llvm.vp.store.nv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
-declare void @llvm.vp.store.nv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
-declare void @llvm.vp.store.nv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nxv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nxv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nxv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nxv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nxv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nxv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nxv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nxv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+
+declare i8 @llvm.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.reduce.add.v16i8(<16 x i8>)
+declare i64 @llvm.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.reduce.add.v16i64(<16 x i64>)
+declare i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8>)
+declare i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8>)
+declare i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8>)
+declare i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64>)
+declare i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64>)
+
+declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.v8i8(i8, <8 x i8>, <8 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.v16i8(i8, <16 x i8>, <16 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v8i64(i64, <8 x i64>, <8 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v16i64(i64, <16 x i64>, <16 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv8i8(i8, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv16i8(i8, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv8i64(i64, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv16i64(i64, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+
+declare float @llvm.reduce.fadd.v2f32(float, <2 x float>)
+declare float @llvm.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.reduce.fadd.v8f32(float, <8 x float>)
+declare float @llvm.reduce.fadd.v16f32(float, <16 x float>)
+declare double @llvm.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.reduce.fadd.v4f64(double, <4 x double>)
+declare double @llvm.reduce.fadd.v8f64(double, <8 x double>)
+declare double @llvm.reduce.fadd.v16f64(double, <16 x double>)
+declare float @llvm.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+declare float @llvm.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
+declare float @llvm.reduce.fadd.nxv16f32(float, <vscale x 16 x float>)
+declare double @llvm.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
+declare double @llvm.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
+declare double @llvm.reduce.fadd.nxv8f64(double, <vscale x 8 x double>)
+declare double @llvm.reduce.fadd.nxv16f64(double, <vscale x 16 x double>)
+
+declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v16f32(float, <16 x float>, <16 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v2f64(double, <2 x double>, <2 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v4f64(double, <4 x double>, <4 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v8f64(double, <8 x double>, <8 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v16f64(double, <16 x double>, <16 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv2f32(float, <vscale x 2 x float>, <vscale x 2 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv4f32(float, <vscale x 4 x float>, <vscale x 4 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv8f32(float, <vscale x 8 x float>, <vscale x 8 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv16f32(float, <vscale x 16 x float>, <vscale x 16 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv2f64(double, <vscale x 2 x double>, <vscale x 2 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv4f64(double, <vscale x 4 x double>, <vscale x 4 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv8f64(double, <vscale x 8 x double>, <vscale x 8 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv16f64(double, <vscale x 16 x double>, <vscale x 16 x i1>, i32)
declare <vscale x 1 x i32> @llvm.fshr.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
declare <vscale x 1 x i32> @llvm.fshl.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
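To reproduce a single cost query from this test without the full file, the sketch below follows the usual print<cost-model> / FileCheck flow. The RUN line and -mattr string here are assumptions for illustration only (the real RUN line lives at the top of rvv-intrinsics.ll and is not reproduced here); only the vp.reduce.add call shape is taken from the test above.

; A minimal sketch, assuming a riscv64 triple with the vector extension;
; the -mattr value is illustrative, not copied from the actual test.
; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes="print<cost-model>" \
; RUN:   -disable-output 2>&1 | FileCheck %s

define i8 @sample_vp_reduce(<2 x i8> %v, <2 x i1> %m, i32 %evl) {
; CHECK-LABEL: 'sample_vp_reduce'
; CHECK: Cost Model: {{.*}} %r = call i8 @llvm.vp.reduce.add.v2i8(
  %r = call i8 @llvm.vp.reduce.add.v2i8(i8 0, <2 x i8> %v, <2 x i1> %m, i32 %evl)
  ret i8 %r
}

declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)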