[llvm] fc865c2 - [RISCV][CostModel] VPIntrinsics have same cost as their non-vp counterparts (#67178)

Thu Oct 5 07:10:07 PDT 2023

Author: Michael Maitland
Date: 2023-10-05T10:10:02-04:00
New Revision: fc865c20345860f394448c228054beafc22a1d4d

URL: https://github.com/llvm/llvm-project/commit/fc865c20345860f394448c228054beafc22a1d4d
DIFF: https://github.com/llvm/llvm-project/commit/fc865c20345860f394448c228054beafc22a1d4d.diff

LOG: [RISCV][CostModel] VPIntrinsics have same cost as their non-vp counterparts (#67178)

On RISCV, only a few VPIntrinsics have their cost modeled by the
VectorIntrinsicCostTable. Even so, none of those entries consider LMUL.
All other VPIntrinsics do not have meaningful modeling.

This patch models the cost of a VPIntrinsic as the cost of its non-VP
counterpart. It is possible that the VP Intrinsic is cheaper than the
non-VP
version depending on VL. On RISCV, this may be due two reasons (if the
instruction is part of a loop):
  1. A smaller VL can be used on the last iteration of the loop.
  2. The VP instruction may avoid a scalar remainder loop.

I have left this as a TODO since I think this change puts us on the
right path of modeling the cost of a VPInstruction, and it isn't
entirely clear to me how much of a discount we should give to a
known VL<VLMAX or what to do when VL is unknown at compile time.

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/test/Analysis/CostModel/RISCV/gep.ll
    llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0f8fb14f7b0e3b2..5d83a0d37dc3a1b 100644

--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1686,6 +1686,62 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
     }
 
+    // VP Intrinsics should have the same cost as their non-vp counterpart.
+    // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
+    // counterpart when the vector length argument is smaller than the maximum
+    // vector length.
+    if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
+      std::optional<unsigned> FOp =
+          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
+      if (FOp) {
+        // TODO: Support other kinds of Intrinsics (i.e. reductions)
+        if (ICA.getID() == Intrinsic::vp_load) {
+          Align Alignment;
+          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+            Alignment = VPI->getPointerAlignment().valueOrOne();
+          unsigned AS = 0;
+          if (ICA.getArgs().size() > 1)
+            if (auto *PtrTy =
+                    dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
+              AS = PtrTy->getAddressSpace();
+          return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
+                                          AS, CostKind);
+        }
+        if (ICA.getID() == Intrinsic::vp_store) {
+          Align Alignment;
+          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+            Alignment = VPI->getPointerAlignment().valueOrOne();
+          unsigned AS = 0;
+          if (ICA.getArgs().size() >= 2)
+            if (auto *PtrTy =
+                    dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
+              AS = PtrTy->getAddressSpace();
+          return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
+                                          AS, CostKind);
+        }
+        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
+          return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
+                                                 CostKind);
+        }
+      }
+
+      std::optional<Intrinsic::ID> FID =
+          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
+      if (FID) {
+        // Non-vp version will have same Args/Tys except mask and vector length.
+        assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
+               "Expected VPIntrinsic to have Mask and Vector Length args and "
+               "types");
+        ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs()).drop_back(2);
+        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
+
+        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
+                                       NewTys, ICA.getFlags(), ICA.getInst(),
+                                       ICA.getScalarizationCost());
+        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
+      }
+    }
+
     // Assume that we need to scalarize this intrinsic.
     // Compute the scalarization overhead based on Args for a vector
     // intrinsic.

diff  --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll
index be518faf7e05165..4fadf34c1973f83 100644
--- a/llvm/test/Analysis/CostModel/RISCV/gep.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll
@@ -270,7 +270,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = getelementptr i8, ptr %base, i32 42
@@ -282,7 +282,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -340,7 +340,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %7 = getelementptr i8, ptr %base, i32 0
@@ -352,7 +352,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void

diff  --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 93de623cf1c6da9..85364c935267d24 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -206,10 +206,378 @@ define void @vp_fshl() {
   ret void
 }
 
+define void @add() {
+; CHECK-LABEL: 'add'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = add <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = add <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = add <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t14 = add <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t16 = add <16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = add <vscale x 2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t20 = add <vscale x 4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t22 = add <vscale x 8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t24 = add <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = add <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = add <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = add <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = add <vscale x 16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+  %t1 = add <2 x i8> undef, undef
+  %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+  %t3 = add <4 x i8> undef, undef
+  %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+  %t5 = add <8 x i8> undef, undef
+  %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+  %t7 = add <16 x i8> undef, undef
+  %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+  %t9 = add <2 x i64> undef, undef
+  %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+  %t12 = add <4 x i64> undef, undef
+  %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+  %t14 = add <8 x i64> undef, undef
+  %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+  %t16 = add <16 x i64> undef, undef
+  %t17 = call <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+  %t18 = add <vscale x 2 x i8> undef, undef
+  %t19 = call <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+  %t20 = add <vscale x 4 x i8> undef, undef
+  %t21 = call <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  %t22 = add <vscale x 8 x i8> undef, undef
+  %t23 = call <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+  %t24 = add <vscale x 16 x i8> undef, undef
+  %t25 = call <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+  %t26 = add <vscale x 2 x i64> undef, undef
+  %t27 = call <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+  %t28 = add <vscale x 4 x i64> undef, undef
+  %t29 = call <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+  %t30 = add <vscale x 8 x i64> undef, undef
+  %t31 = call <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+  %t32 = add <vscale x 16 x i64> undef, undef
+  ret void
+}
+
+define void @abs() {
+; CHECK-LABEL: 'abs'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %22 = call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %30 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef)
+  call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 0)
+  call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 0, <4 x i1> undef, i32 undef)
+  call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 0)
+  call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 0, <8 x i1> undef, i32 undef)
+  call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 0)
+  call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 0, <16 x i1> undef, i32 undef)
+  call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 0)
+  call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 0, <2 x i1> undef, i32 undef)
+  call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 0)
+  call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 0, <4 x i1> undef, i32 undef)
+  call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 0)
+  call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 0, <8 x i1> undef, i32 undef)
+  call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 0)
+  call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 0, <16 x i1> undef, i32 undef)
+  call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 0)
+  call <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+  call <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8> undef, i1 0)
+  call <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+  call <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8> undef, i1 0)
+  call <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+  call <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8> undef, i1 0)
+  call <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+  call <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8> undef, i1 0)
+  call <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+  call <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64> undef, i1 0)
+  call <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+  call <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64> undef, i1 0)
+  call <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+  call <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64> undef, i1 0)
+  call <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+  call <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64> undef, i1 0)
+  ret void
+}
+
+define void @load() {
+; CHECK-LABEL: 'load'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef)
+  %t1 = load <2 x i8>, ptr undef
+  %t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr undef, <4 x i1> undef, i32 undef)
+  %t3 = load <4 x i8>, ptr undef
+  %t4 = call <8 x i8> @llvm.vp.load.v8i8(ptr undef, <8 x i1> undef, i32 undef)
+  %t5 = load <8 x i8>, ptr undef
+  %t6 = call <16 x i8> @llvm.vp.load.v16i8(ptr undef, <16 x i1> undef, i32 undef)
+  %t7 = load <16 x i8>, ptr undef
+  %t8 = call <2 x i64> @llvm.vp.load.v2i64(ptr undef, <2 x i1> undef, i32 undef)
+  %t9 = load <2 x i64>, ptr undef
+  %t10 = call <4 x i64> @llvm.vp.load.v4i64(ptr undef, <4 x i1> undef, i32 undef)
+  %t12 = load <4 x i64>, ptr undef
+  %t13 = call <8 x i64> @llvm.vp.load.v8i64(ptr undef, <8 x i1> undef, i32 undef)
+  %t14 = load <8 x i64>, ptr undef
+  %t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef)
+  %t16 = load <16 x i64>, ptr undef
+  %t17 = call <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  %t18 = load <vscale x 2 x i8>, ptr undef
+  %t19 = call <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  %t20 = load <vscale x 4 x i8>, ptr undef
+  %t21 = call <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  %t22 = load <vscale x 8 x i8>, ptr undef
+  %t23 = call <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  %t24 = load <vscale x 16 x i8>, ptr undef
+  %t25 = call <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  %t26 = load <vscale x 2 x i64>, ptr undef
+  %t27 = call <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  %t28 = load <vscale x 4 x i64>, ptr undef
+  %t29 = call <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  %t30 = load <vscale x 8 x i64>, ptr undef
+  %t31 = call <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  %t32 = load <vscale x 16 x i64>, ptr undef
+  ret void
+}
+
+define void @store() {
+; CHECK-LABEL: 'store'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 2 x i64> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 4 x i64> undef, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 8 x i64> undef, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.vp.store.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+  store <2 x i8> undef, ptr undef
+  call void @llvm.vp.store.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+  store <4 x i8> undef, ptr undef
+  call void @llvm.vp.store.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+  store <8 x i8> undef, ptr undef
+  call void @llvm.vp.store.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+  store <16 x i8> undef, ptr undef
+  call void @llvm.vp.store.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+  store <2 x i64> undef, ptr undef
+  call void @llvm.vp.store.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+  store <4 x i64> undef, ptr undef
+  call void @llvm.vp.store.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+  store <8 x i64> undef, ptr undef
+  call void @llvm.vp.store.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+  store <16 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  store <vscale x 2 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  store <vscale x 4 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  store <vscale x 8 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  store <vscale x 16 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv2i64(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  store <vscale x 2 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv4i64(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  store <vscale x 4 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv8i64(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  store <vscale x 8 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv16i64(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  store <vscale x 16 x i64> undef, ptr undef
+  ret void
+}
+
+declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.add.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.add.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.add.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.add.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.add.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+
+declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.abs.v4i8(<4 x i8>, i1, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.abs.v8i8(<8 x i8>, i1, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.abs.v16i8(<16 x i8>, i1, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.abs.v2i64(<2 x i64>, i1, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.abs.v4i64(<4 x i64>, i1, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.abs.v8i64(<8 x i64>, i1, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.abs.v16i64(<16 x i64>, i1, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
+
+declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
+declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
+declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
+declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
+declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
+declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
+declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
+declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
+declare <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8>, i1)
+declare <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8>, i1)
+declare <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8>, i1)
+declare <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8>, i1)
+declare <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64>, i1)
+declare <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64>, i1)
+declare <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64>, i1)
+declare <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64>, i1)
+
+declare <2 x i8> @llvm.vp.load.v2i8(ptr, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.load.v4i8(ptr, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.load.v8i8(ptr, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.load.v16i8(ptr, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.load.v2i64(ptr, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.load.v4i64(ptr, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.load.v8i64(ptr, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.load.v16i64(ptr, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr, <vscale x 16 x i1>, i32)
+
+declare void @llvm.vp.store.v2i8(<2 x i8>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i8(<4 x i8>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i8(<8 x i8>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i8(<16 x i8>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.v2i64(<2 x i64>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i64(<4 x i64>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i64(<8 x i64>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i64(<16 x i64>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.nv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+
 declare <vscale x 1 x i32> @llvm.fshr.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
 declare <vscale x 1 x i32> @llvm.fshl.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
 
-
 declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float>, i32)
 declare <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float>)