[llvm] Vector masked extract last active element intrinsic (PR #113587)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 14 06:13:20 PST 2024
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/113587
>From e4d20fbb85af1d3054905d700637e5e076e70d0a Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 23 Oct 2024 14:23:56 +0000
Subject: [PATCH 1/3] Initial working version via scalarization
---
llvm/docs/LangRef.rst | 30 +
llvm/include/llvm/IR/Intrinsics.td | 6 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 23 +
.../CodeGen/AArch64/vector-masked-extract.ll | 663 ++++++++++++++++++
4 files changed, 722 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/vector-masked-extract.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ef38c5ab33b926..ef2965a1a19610 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20002,6 +20002,36 @@ the follow sequence of operations:
The ``mask`` operand will apply to at least the gather and scatter operations.
+'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is an overloaded intrinsic.
+
+This intrinsic will extract the value from a single lane of a vector, based
+on a supplied mask vector.
+
+::
+
+ declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+ declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+
+Arguments:
+""""""""""
+
+The first argument is the data vector to extract a lane from. The second is a
+mask vector controlling the extraction. The third argument is a passthru
+value.
+
+The two input vectors must have the same number of elements, and the type of
+the passthru value must match that of the elements of the data vector.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will
+find the index of the most significant active lane in the mask vector, and
+extract the element at that index in the corresponding data vector. If no mask
+lanes are active then the passthru value is returned instead.
.. _int_vector_compress:
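As a concrete illustration of these semantics (with arbitrary example values, not taken from the patch): given %data = <i32 10, i32 20, i32 30, i32 40> and %mask = <i1 true, i1 false, i1 true, i1 false>, the highest active lane is lane 2, so

    %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)

yields 30; with an all-false mask it yields %passthru instead.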
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8ed57f818d6006..bd2edd9f950369 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1920,6 +1920,12 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
[ IntrArgMemOnly ]>;
+// Extract based on mask bits
+def int_experimental_vector_masked_extract_last_active:
+ DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMVectorElementType<0>], [IntrNoMem]>;
+
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
// Integer arithmetic
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3b046aa25f5444..ea0b0330e981b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8207,6 +8207,29 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
visitVectorHistogram(I, Intrinsic);
return;
}
+ case Intrinsic::experimental_vector_masked_extract_last_active: {
+ SDValue Data = getValue(I.getOperand(0));
+ SDValue Mask = getValue(I.getOperand(1));
+ SDValue PassThru = getValue(I.getOperand(2));
+
+ EVT DataVT = Data.getValueType();
+ EVT ScalarVT = PassThru.getValueType();
+ EVT BoolVT = Mask.getValueType().getScalarType();
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT);
+
+ SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT);
+ SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT);
+ SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes);
+ SDValue HighestIdx =
+ DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts);
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx);
+ SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+ SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+ setValue(&I, Result);
+ return;
+ }
}
}
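For reference, the scalarized expansion built above corresponds to roughly the following IR-level sequence for a fixed-width <4 x i32> case (a sketch with illustrative value names; the i64 step elements mirror the vector index type used by the code):

    ; stepvector with inactive lanes zeroed out
    %active = select <4 x i1> %mask, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> zeroinitializer
    ; highest remaining index (0 when no lanes are active)
    %idx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %active)
    %elt = extractelement <4 x i32> %data, i64 %idx
    ; fall back to the passthru value for an all-false mask
    %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
    %res = select i1 %any, i32 %elt, i32 %passthru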
diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
new file mode 100644
index 00000000000000..04adf4e476b041
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
+; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i8:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: umov w15, v1.b[14]
+; NEON-FIXED-NEXT: umov w14, v1.b[6]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
+; NEON-FIXED-NEXT: umov w12, v1.b[15]
+; NEON-FIXED-NEXT: umov w13, v1.b[10]
+; NEON-FIXED-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT: umov w11, v1.b[2]
+; NEON-FIXED-NEXT: umov w8, v1.b[7]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: umov w9, v1.b[11]
+; NEON-FIXED-NEXT: umov w10, v1.b[3]
+; NEON-FIXED-NEXT: umov w16, v1.b[12]
+; NEON-FIXED-NEXT: fmov s3, w15
+; NEON-FIXED-NEXT: umov w15, v1.b[4]
+; NEON-FIXED-NEXT: fmov s4, w14
+; NEON-FIXED-NEXT: fmov s5, w13
+; NEON-FIXED-NEXT: umov w13, v1.b[0]
+; NEON-FIXED-NEXT: umov w14, v1.b[13]
+; NEON-FIXED-NEXT: fmov s6, w11
+; NEON-FIXED-NEXT: umov w11, v1.b[5]
+; NEON-FIXED-NEXT: mov v3.s[1], w12
+; NEON-FIXED-NEXT: umov w12, v1.b[8]
+; NEON-FIXED-NEXT: mov v4.s[1], w8
+; NEON-FIXED-NEXT: umov w8, v1.b[9]
+; NEON-FIXED-NEXT: mov v5.s[1], w9
+; NEON-FIXED-NEXT: umov w9, v1.b[1]
+; NEON-FIXED-NEXT: fmov s7, w16
+; NEON-FIXED-NEXT: fmov s16, w15
+; NEON-FIXED-NEXT: mov v6.s[1], w10
+; NEON-FIXED-NEXT: fmov s18, w13
+; NEON-FIXED-NEXT: shl v1.16b, v1.16b, #7
+; NEON-FIXED-NEXT: fmov s17, w12
+; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT: mov v7.s[1], w14
+; NEON-FIXED-NEXT: mov v16.s[1], w11
+; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT: mov v18.s[1], w9
+; NEON-FIXED-NEXT: adrp x9, .LCPI0_2
+; NEON-FIXED-NEXT: ushll v6.2d, v6.2s, #0
+; NEON-FIXED-NEXT: ldr q20, [x9, :lo12:.LCPI0_2]
+; NEON-FIXED-NEXT: adrp x9, .LCPI0_7
+; NEON-FIXED-NEXT: mov v17.s[1], w8
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_1
+; NEON-FIXED-NEXT: ldr q23, [x9, :lo12:.LCPI0_7]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_1]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_3
+; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT: ushll v7.2d, v7.2s, #0
+; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT: ushll v16.2d, v16.2s, #0
+; NEON-FIXED-NEXT: ushll v17.2d, v17.2s, #0
+; NEON-FIXED-NEXT: shl v6.2d, v6.2d, #63
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: ushll v18.2d, v18.2s, #0
+; NEON-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
+; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
+; NEON-FIXED-NEXT: and v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT: shl v3.2d, v7.2d, #63
+; NEON-FIXED-NEXT: shl v7.2d, v16.2d, #63
+; NEON-FIXED-NEXT: shl v16.2d, v17.2d, #63
+; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI0_3]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_4
+; NEON-FIXED-NEXT: ldr q21, [x8, :lo12:.LCPI0_4]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_5
+; NEON-FIXED-NEXT: shl v18.2d, v18.2d, #63
+; NEON-FIXED-NEXT: ldr q22, [x8, :lo12:.LCPI0_5]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_6
+; NEON-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
+; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_6]
+; NEON-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
+; NEON-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
+; NEON-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
+; NEON-FIXED-NEXT: and v6.16b, v6.16b, v17.16b
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
+; NEON-FIXED-NEXT: umaxv b1, v1.16b
+; NEON-FIXED-NEXT: and v16.16b, v16.16b, v19.16b
+; NEON-FIXED-NEXT: and v17.16b, v18.16b, v23.16b
+; NEON-FIXED-NEXT: cmhi v18.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
+; NEON-FIXED-NEXT: and v3.16b, v3.16b, v21.16b
+; NEON-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
+; NEON-FIXED-NEXT: cmhi v21.2d, v17.2d, v16.2d
+; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v18.16b
+; NEON-FIXED-NEXT: mov v4.16b, v19.16b
+; NEON-FIXED-NEXT: cmhi v20.2d, v7.2d, v3.2d
+; NEON-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
+; NEON-FIXED-NEXT: mov v5.16b, v21.16b
+; NEON-FIXED-NEXT: bit v3.16b, v7.16b, v20.16b
+; NEON-FIXED-NEXT: bsl v5.16b, v17.16b, v16.16b
+; NEON-FIXED-NEXT: cmhi v6.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v3.2d
+; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v6.16b
+; NEON-FIXED-NEXT: bit v3.16b, v5.16b, v7.16b
+; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: cmhi d4, d2, d3
+; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT: ldrb w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i8:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: umov w8, v1.b[14]
+; SVE-FIXED-NEXT: umov w9, v1.b[6]
+; SVE-FIXED-NEXT: index z2.d, #0, #1
+; SVE-FIXED-NEXT: umov w12, v1.b[2]
+; SVE-FIXED-NEXT: umov w10, v1.b[10]
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: umov w13, v1.b[12]
+; SVE-FIXED-NEXT: umov w11, v1.b[15]
+; SVE-FIXED-NEXT: umov w14, v1.b[4]
+; SVE-FIXED-NEXT: umov w16, v1.b[0]
+; SVE-FIXED-NEXT: umov w15, v1.b[8]
+; SVE-FIXED-NEXT: fmov s3, w8
+; SVE-FIXED-NEXT: umov w8, v1.b[7]
+; SVE-FIXED-NEXT: fmov s4, w9
+; SVE-FIXED-NEXT: umov w9, v1.b[11]
+; SVE-FIXED-NEXT: fmov s6, w12
+; SVE-FIXED-NEXT: umov w12, v1.b[3]
+; SVE-FIXED-NEXT: fmov s5, w10
+; SVE-FIXED-NEXT: umov w10, v1.b[1]
+; SVE-FIXED-NEXT: fmov s7, w13
+; SVE-FIXED-NEXT: umov w13, v1.b[13]
+; SVE-FIXED-NEXT: fmov s16, w14
+; SVE-FIXED-NEXT: fmov s18, w16
+; SVE-FIXED-NEXT: mov v4.s[1], w8
+; SVE-FIXED-NEXT: umov w8, v1.b[5]
+; SVE-FIXED-NEXT: mov v3.s[1], w11
+; SVE-FIXED-NEXT: mov v5.s[1], w9
+; SVE-FIXED-NEXT: mov v6.s[1], w12
+; SVE-FIXED-NEXT: umov w9, v1.b[9]
+; SVE-FIXED-NEXT: fmov s17, w15
+; SVE-FIXED-NEXT: mov v18.s[1], w10
+; SVE-FIXED-NEXT: mov z19.d, z2.d
+; SVE-FIXED-NEXT: mov v7.s[1], w13
+; SVE-FIXED-NEXT: mov z20.d, z2.d
+; SVE-FIXED-NEXT: mov z21.d, z2.d
+; SVE-FIXED-NEXT: mov v16.s[1], w8
+; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT: ushll v6.2d, v6.2s, #0
+; SVE-FIXED-NEXT: mov v17.s[1], w9
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: ushll v18.2d, v18.2s, #0
+; SVE-FIXED-NEXT: mov z25.d, z2.d
+; SVE-FIXED-NEXT: ushll v7.2d, v7.2s, #0
+; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT: ushll v16.2d, v16.2s, #0
+; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT: shl v6.2d, v6.2d, #63
+; SVE-FIXED-NEXT: mov z22.d, z2.d
+; SVE-FIXED-NEXT: mov z23.d, z2.d
+; SVE-FIXED-NEXT: add z19.d, z19.d, #6 // =0x6
+; SVE-FIXED-NEXT: shl v18.2d, v18.2d, #63
+; SVE-FIXED-NEXT: ushll v17.2d, v17.2s, #0
+; SVE-FIXED-NEXT: shl v7.2d, v7.2d, #63
+; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT: add z25.d, z25.d, #14 // =0xe
+; SVE-FIXED-NEXT: shl v16.2d, v16.2d, #63
+; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT: add z20.d, z20.d, #10 // =0xa
+; SVE-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
+; SVE-FIXED-NEXT: add z21.d, z21.d, #2 // =0x2
+; SVE-FIXED-NEXT: mov z24.d, z2.d
+; SVE-FIXED-NEXT: shl v17.2d, v17.2d, #63
+; SVE-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
+; SVE-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
+; SVE-FIXED-NEXT: add z22.d, z22.d, #12 // =0xc
+; SVE-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
+; SVE-FIXED-NEXT: add z23.d, z23.d, #4 // =0x4
+; SVE-FIXED-NEXT: and v3.16b, v3.16b, v25.16b
+; SVE-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
+; SVE-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
+; SVE-FIXED-NEXT: and v6.16b, v6.16b, v21.16b
+; SVE-FIXED-NEXT: cmlt v17.2d, v17.2d, #0
+; SVE-FIXED-NEXT: add z24.d, z24.d, #8 // =0x8
+; SVE-FIXED-NEXT: and v2.16b, v18.16b, v2.16b
+; SVE-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
+; SVE-FIXED-NEXT: and v16.16b, v16.16b, v23.16b
+; SVE-FIXED-NEXT: cmhi v18.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT: shl v1.16b, v1.16b, #7
+; SVE-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
+; SVE-FIXED-NEXT: and v17.16b, v17.16b, v24.16b
+; SVE-FIXED-NEXT: cmhi v20.2d, v16.2d, v7.2d
+; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v18.16b
+; SVE-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
+; SVE-FIXED-NEXT: mov v4.16b, v19.16b
+; SVE-FIXED-NEXT: cmhi v21.2d, v2.2d, v17.2d
+; SVE-FIXED-NEXT: umaxv b1, v1.16b
+; SVE-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
+; SVE-FIXED-NEXT: mov v5.16b, v20.16b
+; SVE-FIXED-NEXT: bif v2.16b, v17.16b, v21.16b
+; SVE-FIXED-NEXT: bsl v5.16b, v16.16b, v7.16b
+; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: cmhi d4, d2, d3
+; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT: ldrb w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
+ ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i16:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
+; NEON-FIXED-NEXT: umov w8, v1.b[6]
+; NEON-FIXED-NEXT: umov w9, v1.b[2]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: umov w11, v1.b[4]
+; NEON-FIXED-NEXT: umov w12, v1.b[0]
+; NEON-FIXED-NEXT: umov w10, v1.b[7]
+; NEON-FIXED-NEXT: umov w13, v1.b[3]
+; NEON-FIXED-NEXT: umov w14, v1.b[5]
+; NEON-FIXED-NEXT: umov w15, v1.b[1]
+; NEON-FIXED-NEXT: shl v1.8b, v1.8b, #7
+; NEON-FIXED-NEXT: fmov s2, w8
+; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT: fmov s3, w9
+; NEON-FIXED-NEXT: fmov s4, w11
+; NEON-FIXED-NEXT: adrp x9, .LCPI1_1
+; NEON-FIXED-NEXT: ldr q6, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT: fmov s5, w12
+; NEON-FIXED-NEXT: adrp x8, .LCPI1_3
+; NEON-FIXED-NEXT: ldr q7, [x9, :lo12:.LCPI1_1]
+; NEON-FIXED-NEXT: mov v2.s[1], w10
+; NEON-FIXED-NEXT: mov v3.s[1], w13
+; NEON-FIXED-NEXT: adrp x10, .LCPI1_2
+; NEON-FIXED-NEXT: mov v4.s[1], w14
+; NEON-FIXED-NEXT: ldr q16, [x10, :lo12:.LCPI1_2]
+; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI1_3]
+; NEON-FIXED-NEXT: mov v5.s[1], w15
+; NEON-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT: umaxv b1, v1.8b
+; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
+; NEON-FIXED-NEXT: and v3.16b, v3.16b, v7.16b
+; NEON-FIXED-NEXT: and v4.16b, v4.16b, v16.16b
+; NEON-FIXED-NEXT: and v5.16b, v5.16b, v17.16b
+; NEON-FIXED-NEXT: cmhi v6.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v4.2d
+; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v6.16b
+; NEON-FIXED-NEXT: mov v3.16b, v7.16b
+; NEON-FIXED-NEXT: bsl v3.16b, v5.16b, v4.16b
+; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: cmhi d4, d2, d3
+; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT: ldrh w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i16:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
+; SVE-FIXED-NEXT: umov w8, v1.b[0]
+; SVE-FIXED-NEXT: umov w10, v1.b[6]
+; SVE-FIXED-NEXT: index z6.d, #0, #1
+; SVE-FIXED-NEXT: umov w11, v1.b[2]
+; SVE-FIXED-NEXT: umov w14, v1.b[4]
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: umov w9, v1.b[1]
+; SVE-FIXED-NEXT: umov w12, v1.b[7]
+; SVE-FIXED-NEXT: umov w13, v1.b[3]
+; SVE-FIXED-NEXT: fmov s2, w8
+; SVE-FIXED-NEXT: umov w8, v1.b[5]
+; SVE-FIXED-NEXT: fmov s3, w10
+; SVE-FIXED-NEXT: fmov s4, w11
+; SVE-FIXED-NEXT: fmov s5, w14
+; SVE-FIXED-NEXT: mov z7.d, z6.d
+; SVE-FIXED-NEXT: mov z16.d, z6.d
+; SVE-FIXED-NEXT: mov z17.d, z6.d
+; SVE-FIXED-NEXT: shl v1.8b, v1.8b, #7
+; SVE-FIXED-NEXT: mov v2.s[1], w9
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov v3.s[1], w12
+; SVE-FIXED-NEXT: mov v4.s[1], w13
+; SVE-FIXED-NEXT: mov v5.s[1], w8
+; SVE-FIXED-NEXT: add z7.d, z7.d, #2 // =0x2
+; SVE-FIXED-NEXT: add z17.d, z17.d, #6 // =0x6
+; SVE-FIXED-NEXT: add z16.d, z16.d, #4 // =0x4
+; SVE-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
+; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT: umaxv b1, v1.8b
+; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
+; SVE-FIXED-NEXT: and v3.16b, v3.16b, v17.16b
+; SVE-FIXED-NEXT: and v4.16b, v4.16b, v7.16b
+; SVE-FIXED-NEXT: and v5.16b, v5.16b, v16.16b
+; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: cmhi d4, d2, d3
+; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT: ldrh w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru)
+ ret i16 %res
+}
+
+define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i32:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: ushll v2.4s, v1.4h, #0
+; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
+; NEON-FIXED-NEXT: adrp x9, .LCPI2_1
+; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT: ldr q5, [x9, :lo12:.LCPI2_1]
+; NEON-FIXED-NEXT: shl v1.4h, v1.4h, #15
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
+; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
+; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT: and v3.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: and v2.16b, v2.16b, v5.16b
+; NEON-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; NEON-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: cmhi d4, d3, d2
+; NEON-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT: ldr w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i32:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: ushll v2.4s, v1.4h, #0
+; SVE-FIXED-NEXT: index z4.d, #0, #1
+; SVE-FIXED-NEXT: shl v1.4h, v1.4h, #15
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
+; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
+; SVE-FIXED-NEXT: mov z5.d, z4.d
+; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: add z5.d, z5.d, #2 // =0x2
+; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT: and v2.16b, v2.16b, v4.16b
+; SVE-FIXED-NEXT: and v3.16b, v3.16b, v5.16b
+; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: cmhi d4, d3, d2
+; SVE-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT: ldr w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+ ret i32 %res
+}
+
+define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) {
+; CHECK-LABEL: extract_last_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ushll v3.2d, v1.2s, #0
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d2, xzr
+; CHECK-NEXT: fmov d4, x8
+; CHECK-NEXT: shl v1.2s, v1.2s, #31
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: shl v3.2d, v3.2d, #63
+; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: and v3.8b, v3.8b, v4.8b
+; CHECK-NEXT: cmhi d2, d2, d3
+; CHECK-NEXT: bic v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: orr x8, x9, x8, lsl #3
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: ldr x8, [x8]
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x0, x8, x0, ne
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru)
+ ret i64 %res
+}
+
+define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z3.d, #0 // =0x0
+; CHECK-NEXT: punpkhi p4.h, p0.b
+; CHECK-NEXT: punpklo p5.h, p2.b
+; CHECK-NEXT: punpkhi p1.h, p4.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: punpkhi p3.h, p2.b
+; CHECK-NEXT: punpklo p2.h, p4.b
+; CHECK-NEXT: incd z2.d
+; CHECK-NEXT: incd z5.d, all, mul #2
+; CHECK-NEXT: punpklo p4.h, p5.b
+; CHECK-NEXT: incd z6.d, all, mul #4
+; CHECK-NEXT: punpkhi p6.h, p1.b
+; CHECK-NEXT: punpkhi p7.h, p3.b
+; CHECK-NEXT: sel z1.d, p4, z1.d, z3.d
+; CHECK-NEXT: mov z4.d, z2.d
+; CHECK-NEXT: mov z7.d, z2.d
+; CHECK-NEXT: mov z25.d, z5.d
+; CHECK-NEXT: punpkhi p5.h, p5.b
+; CHECK-NEXT: punpkhi p4.h, p2.b
+; CHECK-NEXT: incd z4.d, all, mul #2
+; CHECK-NEXT: incd z25.d, all, mul #4
+; CHECK-NEXT: incd z7.d, all, mul #4
+; CHECK-NEXT: punpklo p3.h, p3.b
+; CHECK-NEXT: sel z2.d, p5, z2.d, z3.d
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: mov z24.d, z4.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: sel z5.d, p3, z5.d, z3.d
+; CHECK-NEXT: sel z4.d, p7, z4.d, z3.d
+; CHECK-NEXT: sel z6.d, p2, z6.d, z3.d
+; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: sel z25.d, p1, z25.d, z3.d
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: incd z24.d, all, mul #4
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z6.d
+; CHECK-NEXT: sel z24.d, p6, z24.d, z3.d
+; CHECK-NEXT: mov z3.d, p4/m, z7.d
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: umax z4.d, p1/m, z4.d, z24.d
+; CHECK-NEXT: umax z2.d, p1/m, z2.d, z3.d
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: umax z3.d, p1/m, z3.d, z25.d
+; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p1.b, xzr, x8
+; CHECK-NEXT: ptest p0, p0.b
+; CHECK-NEXT: lastb w8, p1, z0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
+ ret i8 %res
+}
+
+define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: mov z5.d, #0 // =0x0
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p3.h, p1.b
+; CHECK-NEXT: punpkhi p4.h, p2.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: incd z2.d
+; CHECK-NEXT: incd z3.d, all, mul #2
+; CHECK-NEXT: sel z1.d, p2, z1.d, z5.d
+; CHECK-NEXT: mov z4.d, z2.d
+; CHECK-NEXT: sel z2.d, p4, z2.d, z5.d
+; CHECK-NEXT: sel z3.d, p1, z3.d, z5.d
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: incd z4.d, all, mul #2
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
+; CHECK-NEXT: sel z4.d, p3, z4.d, z5.d
+; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p1.h, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.h
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+ ret i16 %res
+}
+
+define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: mov z3.d, #0 // =0x0
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: sel z1.d, p2, z1.d, z3.d
+; CHECK-NEXT: incd z2.d
+; CHECK-NEXT: sel z2.d, p1, z2.d, z3.d
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p1.s, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.s
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
+ ret i32 %res
+}
+
+define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
+; CHECK-LABEL: extract_last_i64_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p2.d, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb x8, p2, z0.d
+; CHECK-NEXT: csel x0, x8, x0, ne
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
+ ret i64 %res
+}
+
+declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
+declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
+declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
+declare i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
+declare i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
>From 641715c419105dd9c18798a19509aa9a84a6856c Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 7 Nov 2024 17:20:20 +0000
Subject: [PATCH 2/3] Address review comments
---
llvm/docs/LangRef.rst | 19 +-
llvm/include/llvm/IR/Intrinsics.td | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 32 +-
llvm/lib/IR/AutoUpgrade.cpp | 3 +
.../AArch64/vector-extract-last-active.ll | 420 +++++++++++
.../CodeGen/AArch64/vector-masked-extract.ll | 663 ------------------
6 files changed, 456 insertions(+), 683 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
delete mode 100644 llvm/test/CodeGen/AArch64/vector-masked-extract.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ef2965a1a19610..7f7eaa240c2956 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20002,18 +20002,15 @@ the follow sequence of operations:
The ``mask`` operand will apply to at least the gather and scatter operations.
-'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.extract.last.active``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This is an overloaded intrinsic.
-This intrinsic will extract the value from a single lane of a vector, based
-on a supplied mask vector.
-
::
- declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
- declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+ declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+ declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
Arguments:
""""""""""
@@ -20028,10 +20025,10 @@ the passthru value must match that of the elements of the data vector.
Semantics:
""""""""""
-The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will
-find the index of the most significant active lane in the mask vector, and
-extract the element at that index in the corresponding data vector. If no mask
-lanes are active then the passthru value is returned instead.
+The '``llvm.experimental.vector.extract.last.active``' intrinsic will extract an
+element from the data vector at the index matching the highest active lane of
+the mask vector. If no mask lanes are active then the passthru value is
+returned instead.
.. _int_vector_compress:
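Note that the element type is not restricted to integers; the reworked tests below also exercise floating-point data, e.g.:

    declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %mask, float %passthru)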
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index bd2edd9f950369..02caa076d12c8f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1921,7 +1921,7 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
[ IntrArgMemOnly ]>;
// Extract based on mask bits
-def int_experimental_vector_masked_extract_last_active:
+def int_experimental_vector_extract_last_active:
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMVectorElementType<0>], [IntrNoMem]>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ea0b0330e981b1..06755926841ac3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8207,7 +8207,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
visitVectorHistogram(I, Intrinsic);
return;
}
- case Intrinsic::experimental_vector_masked_extract_last_active: {
+ case Intrinsic::experimental_vector_extract_last_active: {
SDValue Data = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
SDValue PassThru = getValue(I.getOperand(2));
@@ -8215,16 +8215,32 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
EVT DataVT = Data.getValueType();
EVT ScalarVT = PassThru.getValueType();
EVT BoolVT = Mask.getValueType().getScalarType();
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
- EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT);
- SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT);
- SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT);
- SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes);
+ // Find a suitable type for a stepvector.
+ ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+ if (DataVT.isScalableVector())
+ VScaleRange = getVScaleRange(I.getCaller(), 64);
+ unsigned EltWidth = TLI.getBitWidthForCttzElements(
+ I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
+ &VScaleRange);
+ MVT StepVT = MVT::getIntegerVT(EltWidth);
+ EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+ // Zero out lanes with inactive elements, then find the highest remaining
+ // value from the stepvector.
+ SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
+ SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
+ SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
SDValue HighestIdx =
- DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts);
+ DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
+
+  // Extract the corresponding lane from the data vector.

+ EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
SDValue Extract =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx);
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
+
+ // If all mask lanes were inactive, choose the passthru value instead.
SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
setValue(&I, Result);
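A quick worked example of the step-type selection (assuming vscale_range(1,16), as the tests below do): <vscale x 16 x i8> has at most 16 * 16 = 256 lanes, so every lane index fits in an i8 step vector rather than the i64 vectors built by the previous version, and the reduced maximum is zero-extended back to the vector index type for the extract. The nxv16i8 expansion then looks roughly like this (a sketch; intrinsic names are illustrative):

    %step = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
    %active = select <vscale x 16 x i1> %mask, <vscale x 16 x i8> %step, <vscale x 16 x i8> zeroinitializer
    %idx8 = call i8 @llvm.vector.reduce.umax.nxv16i8(<vscale x 16 x i8> %active)
    %idx = zext i8 %idx8 to i64
    %elt = extractelement <vscale x 16 x i8> %data, i64 %idx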
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d0e0da53307cf8..e73538da282e99 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1119,6 +1119,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
if (Name.consume_front("experimental.vector.")) {
Intrinsic::ID ID =
StringSwitch<Intrinsic::ID>(Name)
+      // Skip over extract.last.active; otherwise it will be 'upgraded'
+      // to a regular vector extract, which is a different operation.
+ .StartsWith("extract.last.active.", Intrinsic::not_intrinsic)
.StartsWith("extract.", Intrinsic::vector_extract)
.StartsWith("insert.", Intrinsic::vector_insert)
.StartsWith("splice.", Intrinsic::vector_splice)
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
new file mode 100644
index 00000000000000..c0f1720e1cf8b3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -0,0 +1,420 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
+; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i8:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
+; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
+; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT: umaxv b1, v1.16b
+; NEON-FIXED-NEXT: umaxv b2, v2.16b
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT: ldrb w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i8:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: index z2.b, #0, #1
+; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0
+; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: bic v2.16b, v2.16b, v3.16b
+; SVE-FIXED-NEXT: umaxv b1, v1.16b
+; SVE-FIXED-NEXT: umaxv b2, v2.16b
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT: ldrb w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <16 x i8> %mask, zeroinitializer
+ %res = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %notzero, i8 %passthru)
+ ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i16:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
+; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: xtn v1.8b, v1.8h
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: umaxv b1, v1.8b
+; NEON-FIXED-NEXT: umaxv b2, v2.8b
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT: ldrh w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i16:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
+; SVE-FIXED-NEXT: index z2.b, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: xtn v1.8b, v1.8h
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: umaxv b1, v1.8b
+; SVE-FIXED-NEXT: umaxv b2, v2.8b
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT: ldrh w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <8 x i16> %mask, zeroinitializer
+ %res = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %notzero, i16 %passthru)
+ ret i16 %res
+}
+
+define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i32:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
+; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: umaxv h2, v2.4h
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT: ldr w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i32:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
+; SVE-FIXED-NEXT: index z2.h, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: umaxv h2, v2.4h
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT: ldr w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <4 x i32> %mask, zeroinitializer
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %notzero, i32 %passthru)
+ ret i32 %res
+}
+
+define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i64:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
+; NEON-FIXED-NEXT: adrp x8, .LCPI3_0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT: ldr x8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel x0, x8, x0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i64:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
+; SVE-FIXED-NEXT: index z2.s, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT: ldr x8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel x0, x8, x0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <2 x i64> %mask, zeroinitializer
+ %res = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %notzero, i64 %passthru)
+ ret i64 %res
+}
+
+define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %passthru) {
+; NEON-FIXED-LABEL: extract_last_float:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
+; NEON-FIXED-NEXT: adrp x8, .LCPI4_0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
+; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: umaxv h3, v3.4h
+; NEON-FIXED-NEXT: fmov w8, s3
+; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT: fmov w8, s1
+; NEON-FIXED-NEXT: ldr s0, [x9]
+; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_float:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
+; SVE-FIXED-NEXT: index z3.h, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
+; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: umaxv h3, v3.4h
+; SVE-FIXED-NEXT: fmov w8, s3
+; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT: fmov w8, s1
+; SVE-FIXED-NEXT: ldr s0, [x9]
+; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <4 x i32> %mask, zeroinitializer
+ %res = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %notzero, float %passthru)
+ ret float %res
+}
+
+define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %passthru) {
+; NEON-FIXED-LABEL: extract_last_double:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
+; NEON-FIXED-NEXT: adrp x8, .LCPI5_0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
+; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
+; NEON-FIXED-NEXT: fmov w8, s3
+; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT: fmov w8, s1
+; NEON-FIXED-NEXT: ldr d0, [x9]
+; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_double:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
+; SVE-FIXED-NEXT: index z3.s, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
+; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
+; SVE-FIXED-NEXT: fmov w8, s3
+; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT: fmov w8, s1
+; SVE-FIXED-NEXT: ldr d0, [x9]
+; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <2 x i64> %mask, zeroinitializer
+ %res = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> %data, <2 x i1> %notzero, double %passthru)
+ ret double %res
+}
+
+define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.b, #0, #1
+; CHECK-NEXT: mov z2.b, #0 // =0x0
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: sel z1.b, p0, z1.b, z2.b
+; CHECK-NEXT: umaxv b1, p1, z1.b
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p1.b, xzr, x8
+; CHECK-NEXT: ptest p0, p0.b
+; CHECK-NEXT: lastb w8, p1, z0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
+ ret i8 %res
+}
+
+define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.h, #0, #1
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h
+; CHECK-NEXT: umaxv h1, p1, z1.h
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p2.h, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb w8, p2, z0.h
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+ ret i16 %res
+}
+
+define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.s, #0, #1
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s
+; CHECK-NEXT: umaxv s1, p1, z1.s
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p2.s, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb w8, p2, z0.s
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
+ ret i32 %res
+}
+
+define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
+; CHECK-LABEL: extract_last_i64_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p2.d, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb x8, p2, z0.d
+; CHECK-NEXT: csel x0, x8, x0, ne
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
+ ret i64 %res
+}
+
+define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) #0 {
+; CHECK-LABEL: extract_last_float_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z2.s, #0, #1
+; CHECK-NEXT: mov z3.s, #0 // =0x0
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: sel z2.s, p0, z2.s, z3.s
+; CHECK-NEXT: umaxv s2, p1, z2.s
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p2.s, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb s0, p2, z0.s
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: ret
+ %res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru)
+ ret float %res
+}
+
+define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) #0 {
+; CHECK-LABEL: extract_last_double_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z2.d, #0, #1
+; CHECK-NEXT: mov z3.d, #0 // =0x0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sel z2.d, p0, z2.d, z3.d
+; CHECK-NEXT: umaxv d2, p1, z2.d
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p2.d, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb d0, p2, z0.d
+; CHECK-NEXT: fcsel d0, d0, d1, ne
+; CHECK-NEXT: ret
+ %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
+ ret double %res
+}
+
+declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
+declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
+declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
+declare i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
+declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float>, <4 x i1>, float)
+declare double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double>, <2 x i1>, double)
+declare i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
+declare double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
deleted file mode 100644
index 04adf4e476b041..00000000000000
--- a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
+++ /dev/null
@@ -1,663 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
-; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
-
-define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i8:
-; NEON-FIXED: // %bb.0:
-; NEON-FIXED-NEXT: sub sp, sp, #16
-; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: umov w15, v1.b[14]
-; NEON-FIXED-NEXT: umov w14, v1.b[6]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
-; NEON-FIXED-NEXT: umov w12, v1.b[15]
-; NEON-FIXED-NEXT: umov w13, v1.b[10]
-; NEON-FIXED-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
-; NEON-FIXED-NEXT: umov w11, v1.b[2]
-; NEON-FIXED-NEXT: umov w8, v1.b[7]
-; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: umov w9, v1.b[11]
-; NEON-FIXED-NEXT: umov w10, v1.b[3]
-; NEON-FIXED-NEXT: umov w16, v1.b[12]
-; NEON-FIXED-NEXT: fmov s3, w15
-; NEON-FIXED-NEXT: umov w15, v1.b[4]
-; NEON-FIXED-NEXT: fmov s4, w14
-; NEON-FIXED-NEXT: fmov s5, w13
-; NEON-FIXED-NEXT: umov w13, v1.b[0]
-; NEON-FIXED-NEXT: umov w14, v1.b[13]
-; NEON-FIXED-NEXT: fmov s6, w11
-; NEON-FIXED-NEXT: umov w11, v1.b[5]
-; NEON-FIXED-NEXT: mov v3.s[1], w12
-; NEON-FIXED-NEXT: umov w12, v1.b[8]
-; NEON-FIXED-NEXT: mov v4.s[1], w8
-; NEON-FIXED-NEXT: umov w8, v1.b[9]
-; NEON-FIXED-NEXT: mov v5.s[1], w9
-; NEON-FIXED-NEXT: umov w9, v1.b[1]
-; NEON-FIXED-NEXT: fmov s7, w16
-; NEON-FIXED-NEXT: fmov s16, w15
-; NEON-FIXED-NEXT: mov v6.s[1], w10
-; NEON-FIXED-NEXT: fmov s18, w13
-; NEON-FIXED-NEXT: shl v1.16b, v1.16b, #7
-; NEON-FIXED-NEXT: fmov s17, w12
-; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; NEON-FIXED-NEXT: mov v7.s[1], w14
-; NEON-FIXED-NEXT: mov v16.s[1], w11
-; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; NEON-FIXED-NEXT: mov v18.s[1], w9
-; NEON-FIXED-NEXT: adrp x9, .LCPI0_2
-; NEON-FIXED-NEXT: ushll v6.2d, v6.2s, #0
-; NEON-FIXED-NEXT: ldr q20, [x9, :lo12:.LCPI0_2]
-; NEON-FIXED-NEXT: adrp x9, .LCPI0_7
-; NEON-FIXED-NEXT: mov v17.s[1], w8
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_1
-; NEON-FIXED-NEXT: ldr q23, [x9, :lo12:.LCPI0_7]
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_1]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_3
-; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; NEON-FIXED-NEXT: ushll v7.2d, v7.2s, #0
-; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; NEON-FIXED-NEXT: ushll v16.2d, v16.2s, #0
-; NEON-FIXED-NEXT: ushll v17.2d, v17.2s, #0
-; NEON-FIXED-NEXT: shl v6.2d, v6.2d, #63
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: ushll v18.2d, v18.2s, #0
-; NEON-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
-; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; NEON-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
-; NEON-FIXED-NEXT: and v2.16b, v3.16b, v2.16b
-; NEON-FIXED-NEXT: shl v3.2d, v7.2d, #63
-; NEON-FIXED-NEXT: shl v7.2d, v16.2d, #63
-; NEON-FIXED-NEXT: shl v16.2d, v17.2d, #63
-; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI0_3]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_4
-; NEON-FIXED-NEXT: ldr q21, [x8, :lo12:.LCPI0_4]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_5
-; NEON-FIXED-NEXT: shl v18.2d, v18.2d, #63
-; NEON-FIXED-NEXT: ldr q22, [x8, :lo12:.LCPI0_5]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_6
-; NEON-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
-; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_6]
-; NEON-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
-; NEON-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
-; NEON-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
-; NEON-FIXED-NEXT: and v6.16b, v6.16b, v17.16b
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
-; NEON-FIXED-NEXT: umaxv b1, v1.16b
-; NEON-FIXED-NEXT: and v16.16b, v16.16b, v19.16b
-; NEON-FIXED-NEXT: and v17.16b, v18.16b, v23.16b
-; NEON-FIXED-NEXT: cmhi v18.2d, v4.2d, v2.2d
-; NEON-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
-; NEON-FIXED-NEXT: and v3.16b, v3.16b, v21.16b
-; NEON-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
-; NEON-FIXED-NEXT: cmhi v21.2d, v17.2d, v16.2d
-; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v18.16b
-; NEON-FIXED-NEXT: mov v4.16b, v19.16b
-; NEON-FIXED-NEXT: cmhi v20.2d, v7.2d, v3.2d
-; NEON-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
-; NEON-FIXED-NEXT: mov v5.16b, v21.16b
-; NEON-FIXED-NEXT: bit v3.16b, v7.16b, v20.16b
-; NEON-FIXED-NEXT: bsl v5.16b, v17.16b, v16.16b
-; NEON-FIXED-NEXT: cmhi v6.2d, v4.2d, v2.2d
-; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v3.2d
-; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v6.16b
-; NEON-FIXED-NEXT: bit v3.16b, v5.16b, v7.16b
-; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT: cmhi d4, d2, d3
-; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
-; NEON-FIXED-NEXT: ldrb w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
-; NEON-FIXED-NEXT: add sp, sp, #16
-; NEON-FIXED-NEXT: ret
-;
-; SVE-FIXED-LABEL: extract_last_i8:
-; SVE-FIXED: // %bb.0:
-; SVE-FIXED-NEXT: sub sp, sp, #16
-; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: umov w8, v1.b[14]
-; SVE-FIXED-NEXT: umov w9, v1.b[6]
-; SVE-FIXED-NEXT: index z2.d, #0, #1
-; SVE-FIXED-NEXT: umov w12, v1.b[2]
-; SVE-FIXED-NEXT: umov w10, v1.b[10]
-; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: umov w13, v1.b[12]
-; SVE-FIXED-NEXT: umov w11, v1.b[15]
-; SVE-FIXED-NEXT: umov w14, v1.b[4]
-; SVE-FIXED-NEXT: umov w16, v1.b[0]
-; SVE-FIXED-NEXT: umov w15, v1.b[8]
-; SVE-FIXED-NEXT: fmov s3, w8
-; SVE-FIXED-NEXT: umov w8, v1.b[7]
-; SVE-FIXED-NEXT: fmov s4, w9
-; SVE-FIXED-NEXT: umov w9, v1.b[11]
-; SVE-FIXED-NEXT: fmov s6, w12
-; SVE-FIXED-NEXT: umov w12, v1.b[3]
-; SVE-FIXED-NEXT: fmov s5, w10
-; SVE-FIXED-NEXT: umov w10, v1.b[1]
-; SVE-FIXED-NEXT: fmov s7, w13
-; SVE-FIXED-NEXT: umov w13, v1.b[13]
-; SVE-FIXED-NEXT: fmov s16, w14
-; SVE-FIXED-NEXT: fmov s18, w16
-; SVE-FIXED-NEXT: mov v4.s[1], w8
-; SVE-FIXED-NEXT: umov w8, v1.b[5]
-; SVE-FIXED-NEXT: mov v3.s[1], w11
-; SVE-FIXED-NEXT: mov v5.s[1], w9
-; SVE-FIXED-NEXT: mov v6.s[1], w12
-; SVE-FIXED-NEXT: umov w9, v1.b[9]
-; SVE-FIXED-NEXT: fmov s17, w15
-; SVE-FIXED-NEXT: mov v18.s[1], w10
-; SVE-FIXED-NEXT: mov z19.d, z2.d
-; SVE-FIXED-NEXT: mov v7.s[1], w13
-; SVE-FIXED-NEXT: mov z20.d, z2.d
-; SVE-FIXED-NEXT: mov z21.d, z2.d
-; SVE-FIXED-NEXT: mov v16.s[1], w8
-; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; SVE-FIXED-NEXT: ushll v6.2d, v6.2s, #0
-; SVE-FIXED-NEXT: mov v17.s[1], w9
-; SVE-FIXED-NEXT: mov x9, sp
-; SVE-FIXED-NEXT: ushll v18.2d, v18.2s, #0
-; SVE-FIXED-NEXT: mov z25.d, z2.d
-; SVE-FIXED-NEXT: ushll v7.2d, v7.2s, #0
-; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; SVE-FIXED-NEXT: ushll v16.2d, v16.2s, #0
-; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; SVE-FIXED-NEXT: shl v6.2d, v6.2d, #63
-; SVE-FIXED-NEXT: mov z22.d, z2.d
-; SVE-FIXED-NEXT: mov z23.d, z2.d
-; SVE-FIXED-NEXT: add z19.d, z19.d, #6 // =0x6
-; SVE-FIXED-NEXT: shl v18.2d, v18.2d, #63
-; SVE-FIXED-NEXT: ushll v17.2d, v17.2s, #0
-; SVE-FIXED-NEXT: shl v7.2d, v7.2d, #63
-; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; SVE-FIXED-NEXT: add z25.d, z25.d, #14 // =0xe
-; SVE-FIXED-NEXT: shl v16.2d, v16.2d, #63
-; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; SVE-FIXED-NEXT: add z20.d, z20.d, #10 // =0xa
-; SVE-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
-; SVE-FIXED-NEXT: add z21.d, z21.d, #2 // =0x2
-; SVE-FIXED-NEXT: mov z24.d, z2.d
-; SVE-FIXED-NEXT: shl v17.2d, v17.2d, #63
-; SVE-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
-; SVE-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
-; SVE-FIXED-NEXT: add z22.d, z22.d, #12 // =0xc
-; SVE-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
-; SVE-FIXED-NEXT: add z23.d, z23.d, #4 // =0x4
-; SVE-FIXED-NEXT: and v3.16b, v3.16b, v25.16b
-; SVE-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
-; SVE-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
-; SVE-FIXED-NEXT: and v6.16b, v6.16b, v21.16b
-; SVE-FIXED-NEXT: cmlt v17.2d, v17.2d, #0
-; SVE-FIXED-NEXT: add z24.d, z24.d, #8 // =0x8
-; SVE-FIXED-NEXT: and v2.16b, v18.16b, v2.16b
-; SVE-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
-; SVE-FIXED-NEXT: and v16.16b, v16.16b, v23.16b
-; SVE-FIXED-NEXT: cmhi v18.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT: shl v1.16b, v1.16b, #7
-; SVE-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
-; SVE-FIXED-NEXT: and v17.16b, v17.16b, v24.16b
-; SVE-FIXED-NEXT: cmhi v20.2d, v16.2d, v7.2d
-; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v18.16b
-; SVE-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
-; SVE-FIXED-NEXT: mov v4.16b, v19.16b
-; SVE-FIXED-NEXT: cmhi v21.2d, v2.2d, v17.2d
-; SVE-FIXED-NEXT: umaxv b1, v1.16b
-; SVE-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
-; SVE-FIXED-NEXT: mov v5.16b, v20.16b
-; SVE-FIXED-NEXT: bif v2.16b, v17.16b, v21.16b
-; SVE-FIXED-NEXT: bsl v5.16b, v16.16b, v7.16b
-; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
-; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
-; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
-; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: cmhi d4, d2, d3
-; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
-; SVE-FIXED-NEXT: ldrb w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
-; SVE-FIXED-NEXT: add sp, sp, #16
-; SVE-FIXED-NEXT: ret
- %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
- ret i8 %res
-}
-
-define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i16:
-; NEON-FIXED: // %bb.0:
-; NEON-FIXED-NEXT: sub sp, sp, #16
-; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
-; NEON-FIXED-NEXT: umov w8, v1.b[6]
-; NEON-FIXED-NEXT: umov w9, v1.b[2]
-; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: umov w11, v1.b[4]
-; NEON-FIXED-NEXT: umov w12, v1.b[0]
-; NEON-FIXED-NEXT: umov w10, v1.b[7]
-; NEON-FIXED-NEXT: umov w13, v1.b[3]
-; NEON-FIXED-NEXT: umov w14, v1.b[5]
-; NEON-FIXED-NEXT: umov w15, v1.b[1]
-; NEON-FIXED-NEXT: shl v1.8b, v1.8b, #7
-; NEON-FIXED-NEXT: fmov s2, w8
-; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
-; NEON-FIXED-NEXT: fmov s3, w9
-; NEON-FIXED-NEXT: fmov s4, w11
-; NEON-FIXED-NEXT: adrp x9, .LCPI1_1
-; NEON-FIXED-NEXT: ldr q6, [x8, :lo12:.LCPI1_0]
-; NEON-FIXED-NEXT: fmov s5, w12
-; NEON-FIXED-NEXT: adrp x8, .LCPI1_3
-; NEON-FIXED-NEXT: ldr q7, [x9, :lo12:.LCPI1_1]
-; NEON-FIXED-NEXT: mov v2.s[1], w10
-; NEON-FIXED-NEXT: mov v3.s[1], w13
-; NEON-FIXED-NEXT: adrp x10, .LCPI1_2
-; NEON-FIXED-NEXT: mov v4.s[1], w14
-; NEON-FIXED-NEXT: ldr q16, [x10, :lo12:.LCPI1_2]
-; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI1_3]
-; NEON-FIXED-NEXT: mov v5.s[1], w15
-; NEON-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; NEON-FIXED-NEXT: umaxv b1, v1.8b
-; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; NEON-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
-; NEON-FIXED-NEXT: and v3.16b, v3.16b, v7.16b
-; NEON-FIXED-NEXT: and v4.16b, v4.16b, v16.16b
-; NEON-FIXED-NEXT: and v5.16b, v5.16b, v17.16b
-; NEON-FIXED-NEXT: cmhi v6.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v4.2d
-; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v6.16b
-; NEON-FIXED-NEXT: mov v3.16b, v7.16b
-; NEON-FIXED-NEXT: bsl v3.16b, v5.16b, v4.16b
-; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT: cmhi d4, d2, d3
-; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
-; NEON-FIXED-NEXT: ldrh w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
-; NEON-FIXED-NEXT: add sp, sp, #16
-; NEON-FIXED-NEXT: ret
-;
-; SVE-FIXED-LABEL: extract_last_i16:
-; SVE-FIXED: // %bb.0:
-; SVE-FIXED-NEXT: sub sp, sp, #16
-; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
-; SVE-FIXED-NEXT: umov w8, v1.b[0]
-; SVE-FIXED-NEXT: umov w10, v1.b[6]
-; SVE-FIXED-NEXT: index z6.d, #0, #1
-; SVE-FIXED-NEXT: umov w11, v1.b[2]
-; SVE-FIXED-NEXT: umov w14, v1.b[4]
-; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: umov w9, v1.b[1]
-; SVE-FIXED-NEXT: umov w12, v1.b[7]
-; SVE-FIXED-NEXT: umov w13, v1.b[3]
-; SVE-FIXED-NEXT: fmov s2, w8
-; SVE-FIXED-NEXT: umov w8, v1.b[5]
-; SVE-FIXED-NEXT: fmov s3, w10
-; SVE-FIXED-NEXT: fmov s4, w11
-; SVE-FIXED-NEXT: fmov s5, w14
-; SVE-FIXED-NEXT: mov z7.d, z6.d
-; SVE-FIXED-NEXT: mov z16.d, z6.d
-; SVE-FIXED-NEXT: mov z17.d, z6.d
-; SVE-FIXED-NEXT: shl v1.8b, v1.8b, #7
-; SVE-FIXED-NEXT: mov v2.s[1], w9
-; SVE-FIXED-NEXT: mov x9, sp
-; SVE-FIXED-NEXT: mov v3.s[1], w12
-; SVE-FIXED-NEXT: mov v4.s[1], w13
-; SVE-FIXED-NEXT: mov v5.s[1], w8
-; SVE-FIXED-NEXT: add z7.d, z7.d, #2 // =0x2
-; SVE-FIXED-NEXT: add z17.d, z17.d, #6 // =0x6
-; SVE-FIXED-NEXT: add z16.d, z16.d, #4 // =0x4
-; SVE-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
-; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; SVE-FIXED-NEXT: umaxv b1, v1.8b
-; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; SVE-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
-; SVE-FIXED-NEXT: and v3.16b, v3.16b, v17.16b
-; SVE-FIXED-NEXT: and v4.16b, v4.16b, v7.16b
-; SVE-FIXED-NEXT: and v5.16b, v5.16b, v16.16b
-; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
-; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
-; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
-; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: cmhi d4, d2, d3
-; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
-; SVE-FIXED-NEXT: ldrh w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
-; SVE-FIXED-NEXT: add sp, sp, #16
-; SVE-FIXED-NEXT: ret
- %res = call i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru)
- ret i16 %res
-}
-
-define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i32:
-; NEON-FIXED: // %bb.0:
-; NEON-FIXED-NEXT: sub sp, sp, #16
-; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: ushll v2.4s, v1.4h, #0
-; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
-; NEON-FIXED-NEXT: adrp x9, .LCPI2_1
-; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
-; NEON-FIXED-NEXT: ldr q5, [x9, :lo12:.LCPI2_1]
-; NEON-FIXED-NEXT: shl v1.4h, v1.4h, #15
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
-; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; NEON-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
-; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; NEON-FIXED-NEXT: umaxv h1, v1.4h
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; NEON-FIXED-NEXT: and v3.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: and v2.16b, v2.16b, v5.16b
-; NEON-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; NEON-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT: cmhi d4, d3, d2
-; NEON-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: ldr w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
-; NEON-FIXED-NEXT: add sp, sp, #16
-; NEON-FIXED-NEXT: ret
-;
-; SVE-FIXED-LABEL: extract_last_i32:
-; SVE-FIXED: // %bb.0:
-; SVE-FIXED-NEXT: sub sp, sp, #16
-; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: ushll v2.4s, v1.4h, #0
-; SVE-FIXED-NEXT: index z4.d, #0, #1
-; SVE-FIXED-NEXT: shl v1.4h, v1.4h, #15
-; SVE-FIXED-NEXT: mov x9, sp
-; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
-; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; SVE-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
-; SVE-FIXED-NEXT: mov z5.d, z4.d
-; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; SVE-FIXED-NEXT: umaxv h1, v1.4h
-; SVE-FIXED-NEXT: add z5.d, z5.d, #2 // =0x2
-; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; SVE-FIXED-NEXT: and v2.16b, v2.16b, v4.16b
-; SVE-FIXED-NEXT: and v3.16b, v3.16b, v5.16b
-; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: cmhi d4, d3, d2
-; SVE-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: ldr w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
-; SVE-FIXED-NEXT: add sp, sp, #16
-; SVE-FIXED-NEXT: ret
- %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
- ret i32 %res
-}
-
-define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) {
-; CHECK-LABEL: extract_last_i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ushll v3.2d, v1.2s, #0
-; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: fmov d2, xzr
-; CHECK-NEXT: fmov d4, x8
-; CHECK-NEXT: shl v1.2s, v1.2s, #31
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: shl v3.2d, v3.2d, #63
-; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
-; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
-; CHECK-NEXT: umaxp v1.2s, v1.2s, v1.2s
-; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: and v3.8b, v3.8b, v4.8b
-; CHECK-NEXT: cmhi d2, d2, d3
-; CHECK-NEXT: bic v2.8b, v3.8b, v2.8b
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: orr x8, x9, x8, lsl #3
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: csel x0, x8, x0, ne
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
- %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru)
- ret i64 %res
-}
-
-define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
-; CHECK-LABEL: extract_last_i8_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: mov z3.d, #0 // =0x0
-; CHECK-NEXT: punpkhi p4.h, p0.b
-; CHECK-NEXT: punpklo p5.h, p2.b
-; CHECK-NEXT: punpkhi p1.h, p4.b
-; CHECK-NEXT: mov z2.d, z1.d
-; CHECK-NEXT: mov z5.d, z1.d
-; CHECK-NEXT: mov z6.d, z1.d
-; CHECK-NEXT: punpkhi p3.h, p2.b
-; CHECK-NEXT: punpklo p2.h, p4.b
-; CHECK-NEXT: incd z2.d
-; CHECK-NEXT: incd z5.d, all, mul #2
-; CHECK-NEXT: punpklo p4.h, p5.b
-; CHECK-NEXT: incd z6.d, all, mul #4
-; CHECK-NEXT: punpkhi p6.h, p1.b
-; CHECK-NEXT: punpkhi p7.h, p3.b
-; CHECK-NEXT: sel z1.d, p4, z1.d, z3.d
-; CHECK-NEXT: mov z4.d, z2.d
-; CHECK-NEXT: mov z7.d, z2.d
-; CHECK-NEXT: mov z25.d, z5.d
-; CHECK-NEXT: punpkhi p5.h, p5.b
-; CHECK-NEXT: punpkhi p4.h, p2.b
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: incd z25.d, all, mul #4
-; CHECK-NEXT: incd z7.d, all, mul #4
-; CHECK-NEXT: punpklo p3.h, p3.b
-; CHECK-NEXT: sel z2.d, p5, z2.d, z3.d
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: punpklo p2.h, p2.b
-; CHECK-NEXT: mov z24.d, z4.d
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: sel z5.d, p3, z5.d, z3.d
-; CHECK-NEXT: sel z4.d, p7, z4.d, z3.d
-; CHECK-NEXT: sel z6.d, p2, z6.d, z3.d
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: sel z25.d, p1, z25.d, z3.d
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: incd z24.d, all, mul #4
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z6.d
-; CHECK-NEXT: sel z24.d, p6, z24.d, z3.d
-; CHECK-NEXT: mov z3.d, p4/m, z7.d
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: umax z4.d, p1/m, z4.d, z24.d
-; CHECK-NEXT: umax z2.d, p1/m, z2.d, z3.d
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: umax z3.d, p1/m, z3.d, z25.d
-; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p1.b, xzr, x8
-; CHECK-NEXT: ptest p0, p0.b
-; CHECK-NEXT: lastb w8, p1, z0.b
-; CHECK-NEXT: csel w0, w8, w0, ne
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
- %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
- ret i8 %res
-}
-
-define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
-; CHECK-LABEL: extract_last_i16_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: mov z5.d, #0 // =0x0
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: punpkhi p3.h, p1.b
-; CHECK-NEXT: punpkhi p4.h, p2.b
-; CHECK-NEXT: mov z2.d, z1.d
-; CHECK-NEXT: mov z3.d, z1.d
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: punpklo p2.h, p2.b
-; CHECK-NEXT: incd z2.d
-; CHECK-NEXT: incd z3.d, all, mul #2
-; CHECK-NEXT: sel z1.d, p2, z1.d, z5.d
-; CHECK-NEXT: mov z4.d, z2.d
-; CHECK-NEXT: sel z2.d, p4, z2.d, z5.d
-; CHECK-NEXT: sel z3.d, p1, z3.d, z5.d
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
-; CHECK-NEXT: sel z4.d, p3, z4.d, z5.d
-; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p1.h, xzr, x8
-; CHECK-NEXT: lastb w8, p1, z0.h
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ptest p1, p0.b
-; CHECK-NEXT: csel w0, w8, w0, ne
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
- %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
- ret i16 %res
-}
-
-define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
-; CHECK-LABEL: extract_last_i32_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: mov z3.d, #0 // =0x0
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: mov z2.d, z1.d
-; CHECK-NEXT: sel z1.d, p2, z1.d, z3.d
-; CHECK-NEXT: incd z2.d
-; CHECK-NEXT: sel z2.d, p1, z2.d, z3.d
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p1.s, xzr, x8
-; CHECK-NEXT: lastb w8, p1, z0.s
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ptest p1, p0.b
-; CHECK-NEXT: csel w0, w8, w0, ne
-; CHECK-NEXT: ret
- %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
- ret i32 %res
-}
-
-define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
-; CHECK-LABEL: extract_last_i64_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: mov z2.d, #0 // =0x0
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p2.d, xzr, x8
-; CHECK-NEXT: ptest p1, p0.b
-; CHECK-NEXT: lastb x8, p2, z0.d
-; CHECK-NEXT: csel x0, x8, x0, ne
-; CHECK-NEXT: ret
- %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
- ret i64 %res
-}
-
-declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
-declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
-declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
-declare i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
-declare i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
-declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
-declare i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
-declare i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
-
-attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
From 49b9a7a916db0ebc8b58f2cd5442443996f80fc4 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 14 Nov 2024 13:44:45 +0000
Subject: [PATCH 3/3] Move lowering code to a dedicated function
---
.../SelectionDAG/SelectionDAGBuilder.cpp | 81 ++++++++++---------
.../SelectionDAG/SelectionDAGBuilder.h | 1 +
2 files changed, 46 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 06755926841ac3..55cbd093ffd68e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6416,6 +6416,50 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
DAG.setRoot(Histogram);
}
+void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
+ unsigned Intrinsic) {
+ assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
+ "Tried lowering invalid vector extract last");
+ SDLoc sdl = getCurSDLoc();
+ SDValue Data = getValue(I.getOperand(0));
+ SDValue Mask = getValue(I.getOperand(1));
+ SDValue PassThru = getValue(I.getOperand(2));
+
+ EVT DataVT = Data.getValueType();
+ EVT ScalarVT = PassThru.getValueType();
+ EVT BoolVT = Mask.getValueType().getScalarType();
+
+ // Find a suitable type for a stepvector.
+ ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+ if (DataVT.isScalableVector())
+ VScaleRange = getVScaleRange(I.getCaller(), 64);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned EltWidth = TLI.getBitWidthForCttzElements(
+ I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
+ &VScaleRange);
+ MVT StepVT = MVT::getIntegerVT(EltWidth);
+ EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+ // Zero out lanes with inactive elements, then find the highest remaining
+ // value from the stepvector.
+ SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
+ SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
+ SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
+ SDValue HighestIdx =
+ DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
+
+ // Extract the corresponding lane from the data vector
+ EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
+
+ // If all mask lanes were inactive, choose the passthru value instead.
+ SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+ SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+ setValue(&I, Result);
+}
+
/// Lower the call to the specified intrinsic function.
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
unsigned Intrinsic) {
@@ -8208,42 +8252,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::experimental_vector_extract_last_active: {
- SDValue Data = getValue(I.getOperand(0));
- SDValue Mask = getValue(I.getOperand(1));
- SDValue PassThru = getValue(I.getOperand(2));
-
- EVT DataVT = Data.getValueType();
- EVT ScalarVT = PassThru.getValueType();
- EVT BoolVT = Mask.getValueType().getScalarType();
-
- // Find a suitable type for a stepvector.
- ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
- if (DataVT.isScalableVector())
- VScaleRange = getVScaleRange(I.getCaller(), 64);
- unsigned EltWidth = TLI.getBitWidthForCttzElements(
- I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
- &VScaleRange);
- MVT StepVT = MVT::getIntegerVT(EltWidth);
- EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
-
- // Zero out lanes with inactive elements, then find the highest remaining
- // value from the stepvector.
- SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
- SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
- SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
- SDValue HighestIdx =
- DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
-
- // Extract the corresponding lane from the data vector
- EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
- SDValue Extract =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
-
- // If all mask lanes were inactive, choose the passthru value instead.
- SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
- SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
- setValue(&I, Result);
+ visitVectorExtractLastActive(I, Intrinsic);
return;
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 3f8a3e7ffb65bb..3a8dc25e98700e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -629,6 +629,7 @@ class SelectionDAGBuilder {
void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
+ void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic);
void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
const SmallVectorImpl<SDValue> &OpValues);
void visitVPStore(const VPIntrinsic &VPIntrin,
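
For illustration, the lowering performed by visitVectorExtractLastActive corresponds roughly to the following generic IR expansion for a fixed-width <4 x i32> case (a sketch; the function name is made up, and the real code sizes the stepvector element type via getBitWidthForCttzElements):

declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

define i32 @expansion_sketch(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
  ; Keep the step index for active lanes, zero for inactive ones.
  %active = select <4 x i1> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
  ; The unsigned-max reduction yields the index of the last active lane
  ; (or 0 when none are active, which the final select discards).
  %idx = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %active)
  %elt = extractelement <4 x i32> %data, i32 %idx
  ; Choose the passthru value when no mask lanes are active.
  %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
  %res = select i1 %any, i32 %elt, i32 %passthru
  ret i32 %res
}
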