[llvm] e7b89c2 - Add BasicTTIImpl cost model for llvm.get.active.lane.mask intrinsic

Mon Mar 14 02:35:23 PDT 2022

Author: David Sherwood
Date: 2022-03-14T09:35:05Z
New Revision: e7b89c2fc359b3f4cab7b8b30d06e39b20ef6ddf

URL: https://github.com/llvm/llvm-project/commit/e7b89c2fc359b3f4cab7b8b30d06e39b20ef6ddf
DIFF: https://github.com/llvm/llvm-project/commit/e7b89c2fc359b3f4cab7b8b30d06e39b20ef6ddf.diff

LOG: Add BasicTTIImpl cost model for llvm.get.active.lane.mask intrinsic

The vectoriser sometimes generates predicated vector loops using
the llvm.get.active.lane.mask intrinsic so it's important that we
are able to calculate a valid cost for the call instruction. When
SVE is enabled we are able to use a single whilelo instruction
for some vector types - in such cases I've marked the cost as 1.
For all other cases I've set the cost according to how the intrinsic
will be expanded.

Tests added here:

  Analysis/CostModel/AArch64/sve-intrinsics.ll
  Analysis/CostModel/ARM/active_lane_mask.ll
  Analysis/CostModel/RISCV/active_lane_mask.ll

Differential Revision: https://reviews.llvm.org/D121109

Added: 
    llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll
    llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0b2737628923c..8e62dfff117b5 100644

--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1512,6 +1512,29 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       }
       return Cost;
     }
+    case Intrinsic::get_active_lane_mask: {
+      EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
+      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
+
+      // If we're not expanding the intrinsic then we assume this is cheap
+      // to implement.
+      if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
+        std::pair<InstructionCost, MVT> LT =
+            getTLI()->getTypeLegalizationCost(DL, RetTy);
+        return LT.first;
+      }
+
+      // Create the expanded types that will be used to calculate the uadd_sat
+      // operation.
+      Type *ExpRetTy = VectorType::get(
+          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
+      IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
+      InstructionCost Cost =
+          thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
+      Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
+                                          CmpInst::ICMP_ULT, CostKind);
+      return Cost;
+    }
     }
 
     // Assume that we need to scalarize this intrinsic.

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 9c840e1e905a7..03b51cd510ea7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -408,4 +408,77 @@ declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale
 declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
 declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
 
+define void @get_lane_mask() #0 {
+; CHECK-LABEL: 'get_lane_mask'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction:   %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:   %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction:   %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   ret void
+  %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+  %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+  %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+  %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+
+  %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+  %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+  %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+  %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+
+  %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+  %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+
+  %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+  %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+  %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+  %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+
+  %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+  %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+  %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+  %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+
+  %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+  %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+
+  ret void
+}
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32)
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32)
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32, i32)
+declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16, i16)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64, i64)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64, i64)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64, i64)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32, i32)
+declare <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64, i64)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16, i16)
+
 attributes #0 = { "target-features"="+sve,+bf16" }

diff  --git a/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll b/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll
new file mode 100644
index 0000000000000..70e5c874b93d5
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi | FileCheck %s
+
+define void @get_lane_mask() {
+; CHECK-LABEL: 'get_lane_mask'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i16 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i16 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+  %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+  %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+  %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+
+  %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+  %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+  %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+  %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+
+  %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+  %mask_v8i1_i16 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16 undef, i16 undef)
+  %mask_v4i1_i16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16 undef, i16 undef)
+  %mask_v2i1_i16 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16 undef, i16 undef)
+
+  ret void
+}
+
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64, i64)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64, i64)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64, i64)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32, i32)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16, i16)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16, i16)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16, i16)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16, i16)

diff  --git a/llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll b/llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll
new file mode 100644
index 0000000000000..77ff99cda11ef
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh | FileCheck %s
+
+define void @get_lane_mask() {
+; CHECK-LABEL: 'get_lane_mask'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv1i1_i64 = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv1i1_i32 = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+  %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+  %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+  %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+  %mask_nxv1i1_i64 = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 undef, i64 undef)
+
+  %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+  %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+  %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+  %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+  %mask_nxv1i1_i32 = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 undef, i32 undef)
+
+  %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+  %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+
+  %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+  %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+  %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+  %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+
+  %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+  %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+  %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+  %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+
+  %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+  %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+
+  ret void
+}
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
+declare <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32)
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32)
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32, i32)
+declare <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32, i32)
+declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16, i16)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64, i64)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64, i64)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64, i64)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32, i32)
+declare <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64, i64)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16, i16)