[llvm] Vector masked extract last active element intrinsic (PR #113587)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 14 06:13:20 PST 2024


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/113587

>From e4d20fbb85af1d3054905d700637e5e076e70d0a Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 23 Oct 2024 14:23:56 +0000
Subject: [PATCH 1/3] Initial working version via scalarization

---
 llvm/docs/LangRef.rst                         |  30 +
 llvm/include/llvm/IR/Intrinsics.td            |   6 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  23 +
 .../CodeGen/AArch64/vector-masked-extract.ll  | 663 ++++++++++++++++++
 4 files changed, 722 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/vector-masked-extract.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ef38c5ab33b926..ef2965a1a19610 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20002,6 +20002,36 @@ the following sequence of operations:
 
 The ``mask`` operand will apply to at least the gather and scatter operations.
 
+'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is an overloaded intrinsic.
+
+This intrinsic will extract the value from a single lane of a vector, based
+on a supplied mask vector.
+
+::
+
+    declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+    declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+
+Arguments:
+""""""""""
+
+The first argument is the data vector to extract a lane from. The second is a
+mask vector controlling the extraction. The third argument is a passthru
+value.
+
+The two input vectors must have the same number of elements, and the type of
+the passthru value must match that of the elements of the data vector.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will
+find the index of the most significant active lane in the mask vector, and
+extract the element at that index in the corresponding data vector. If no mask
+lanes are active then the passthru value is returned instead.
 
 .. _int_vector_compress:
 
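For reference, the semantics above are equivalent to the following scalarized
IR, shown here as a minimal sketch for the fixed-width v4i32 overload. The
function name is illustrative, and the generic reduction intrinsics stand in
for whatever the target lowering eventually produces::

    declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
    declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

    define i32 @extract_last_active_ref(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
      ; Keep each active lane's index, zero out inactive lanes.
      %steps = select <4 x i1> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
      ; The maximum remaining value is the index of the last active lane.
      %idx = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %steps)
      %elt = extractelement <4 x i32> %data, i32 %idx
      ; If no lane was active, fall back to the passthru value.
      %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
      %res = select i1 %any, i32 %elt, i32 %passthru
      ret i32 %res
    }

The SelectionDAGBuilder change below implements exactly this sequence with
ISD nodes.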
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8ed57f818d6006..bd2edd9f950369 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1920,6 +1920,12 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
                              [ IntrArgMemOnly ]>;
 
+// Extract based on mask bits
+def int_experimental_vector_masked_extract_last_active:
+    DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+              [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+               LLVMVectorElementType<0>], [IntrNoMem]>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3b046aa25f5444..ea0b0330e981b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8207,6 +8207,29 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     visitVectorHistogram(I, Intrinsic);
     return;
   }
+  case Intrinsic::experimental_vector_masked_extract_last_active: {
+    SDValue Data = getValue(I.getOperand(0));
+    SDValue Mask = getValue(I.getOperand(1));
+    SDValue PassThru = getValue(I.getOperand(2));
+
+    EVT DataVT = Data.getValueType();
+    EVT ScalarVT = PassThru.getValueType();
+    EVT BoolVT = Mask.getValueType().getScalarType();
+    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+    EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT);
+
+    SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT);
+    SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT);
+    SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes);
+    SDValue HighestIdx =
+        DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts);
+    SDValue Extract =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx);
+    SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+    SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+    setValue(&I, Result);
+    return;
+  }
   }
 }
 
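One cost of this initial lowering is that the step vector uses the target's
vector index type (i64 on AArch64), so a wide mask such as
<vscale x 16 x i1> produces a <vscale x 16 x i64> select/reduce that the
legalizer must split into eight <vscale x 2 x i64> parts; that is where the
long punpklo/punpkhi and umax sequences in the extract_last_i8_scalable test
below come from. Roughly, and assuming the stepvector intrinsic as the IR
analogue of the STEP_VECTOR node used here, the generated DAG corresponds
to::

    %steps  = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
    %active = select <vscale x 16 x i1> %mask, <vscale x 16 x i64> %steps, <vscale x 16 x i64> zeroinitializer
    ; <vscale x 16 x i64> is not a legal SVE type, so this reduction is
    ; expanded into a tree of umax operations on <vscale x 2 x i64> parts.
    %idx    = call i64 @llvm.vector.reduce.umax.nxv16i64(<vscale x 16 x i64> %active)

The second patch in the series narrows the step type to avoid this.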
diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
new file mode 100644
index 00000000000000..04adf4e476b041
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
+; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i8:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    umov w15, v1.b[14]
+; NEON-FIXED-NEXT:    umov w14, v1.b[6]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_0
+; NEON-FIXED-NEXT:    umov w12, v1.b[15]
+; NEON-FIXED-NEXT:    umov w13, v1.b[10]
+; NEON-FIXED-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT:    umov w11, v1.b[2]
+; NEON-FIXED-NEXT:    umov w8, v1.b[7]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    umov w9, v1.b[11]
+; NEON-FIXED-NEXT:    umov w10, v1.b[3]
+; NEON-FIXED-NEXT:    umov w16, v1.b[12]
+; NEON-FIXED-NEXT:    fmov s3, w15
+; NEON-FIXED-NEXT:    umov w15, v1.b[4]
+; NEON-FIXED-NEXT:    fmov s4, w14
+; NEON-FIXED-NEXT:    fmov s5, w13
+; NEON-FIXED-NEXT:    umov w13, v1.b[0]
+; NEON-FIXED-NEXT:    umov w14, v1.b[13]
+; NEON-FIXED-NEXT:    fmov s6, w11
+; NEON-FIXED-NEXT:    umov w11, v1.b[5]
+; NEON-FIXED-NEXT:    mov v3.s[1], w12
+; NEON-FIXED-NEXT:    umov w12, v1.b[8]
+; NEON-FIXED-NEXT:    mov v4.s[1], w8
+; NEON-FIXED-NEXT:    umov w8, v1.b[9]
+; NEON-FIXED-NEXT:    mov v5.s[1], w9
+; NEON-FIXED-NEXT:    umov w9, v1.b[1]
+; NEON-FIXED-NEXT:    fmov s7, w16
+; NEON-FIXED-NEXT:    fmov s16, w15
+; NEON-FIXED-NEXT:    mov v6.s[1], w10
+; NEON-FIXED-NEXT:    fmov s18, w13
+; NEON-FIXED-NEXT:    shl v1.16b, v1.16b, #7
+; NEON-FIXED-NEXT:    fmov s17, w12
+; NEON-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT:    mov v7.s[1], w14
+; NEON-FIXED-NEXT:    mov v16.s[1], w11
+; NEON-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT:    mov v18.s[1], w9
+; NEON-FIXED-NEXT:    adrp x9, .LCPI0_2
+; NEON-FIXED-NEXT:    ushll v6.2d, v6.2s, #0
+; NEON-FIXED-NEXT:    ldr q20, [x9, :lo12:.LCPI0_2]
+; NEON-FIXED-NEXT:    adrp x9, .LCPI0_7
+; NEON-FIXED-NEXT:    mov v17.s[1], w8
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_1
+; NEON-FIXED-NEXT:    ldr q23, [x9, :lo12:.LCPI0_7]
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr q19, [x8, :lo12:.LCPI0_1]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_3
+; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT:    ushll v7.2d, v7.2s, #0
+; NEON-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT:    ushll v16.2d, v16.2s, #0
+; NEON-FIXED-NEXT:    ushll v17.2d, v17.2s, #0
+; NEON-FIXED-NEXT:    shl v6.2d, v6.2d, #63
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    ushll v18.2d, v18.2s, #0
+; NEON-FIXED-NEXT:    cmlt v1.16b, v1.16b, #0
+; NEON-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT:    cmlt v6.2d, v6.2d, #0
+; NEON-FIXED-NEXT:    and v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT:    shl v3.2d, v7.2d, #63
+; NEON-FIXED-NEXT:    shl v7.2d, v16.2d, #63
+; NEON-FIXED-NEXT:    shl v16.2d, v17.2d, #63
+; NEON-FIXED-NEXT:    ldr q17, [x8, :lo12:.LCPI0_3]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_4
+; NEON-FIXED-NEXT:    ldr q21, [x8, :lo12:.LCPI0_4]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_5
+; NEON-FIXED-NEXT:    shl v18.2d, v18.2d, #63
+; NEON-FIXED-NEXT:    ldr q22, [x8, :lo12:.LCPI0_5]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_6
+; NEON-FIXED-NEXT:    and v4.16b, v4.16b, v19.16b
+; NEON-FIXED-NEXT:    ldr q19, [x8, :lo12:.LCPI0_6]
+; NEON-FIXED-NEXT:    cmlt v16.2d, v16.2d, #0
+; NEON-FIXED-NEXT:    and v5.16b, v5.16b, v20.16b
+; NEON-FIXED-NEXT:    cmlt v18.2d, v18.2d, #0
+; NEON-FIXED-NEXT:    and v6.16b, v6.16b, v17.16b
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    cmlt v7.2d, v7.2d, #0
+; NEON-FIXED-NEXT:    umaxv b1, v1.16b
+; NEON-FIXED-NEXT:    and v16.16b, v16.16b, v19.16b
+; NEON-FIXED-NEXT:    and v17.16b, v18.16b, v23.16b
+; NEON-FIXED-NEXT:    cmhi v18.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT:    cmhi v19.2d, v6.2d, v5.2d
+; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v21.16b
+; NEON-FIXED-NEXT:    and v7.16b, v7.16b, v22.16b
+; NEON-FIXED-NEXT:    cmhi v21.2d, v17.2d, v16.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v4.16b, v18.16b
+; NEON-FIXED-NEXT:    mov v4.16b, v19.16b
+; NEON-FIXED-NEXT:    cmhi v20.2d, v7.2d, v3.2d
+; NEON-FIXED-NEXT:    bsl v4.16b, v6.16b, v5.16b
+; NEON-FIXED-NEXT:    mov v5.16b, v21.16b
+; NEON-FIXED-NEXT:    bit v3.16b, v7.16b, v20.16b
+; NEON-FIXED-NEXT:    bsl v5.16b, v17.16b, v16.16b
+; NEON-FIXED-NEXT:    cmhi v6.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT:    cmhi v7.2d, v5.2d, v3.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v4.16b, v6.16b
+; NEON-FIXED-NEXT:    bit v3.16b, v5.16b, v7.16b
+; NEON-FIXED-NEXT:    cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT:    cmhi d4, d2, d3
+; NEON-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT:    fmov x8, d2
+; NEON-FIXED-NEXT:    bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT:    ldrb w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i8:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    umov w8, v1.b[14]
+; SVE-FIXED-NEXT:    umov w9, v1.b[6]
+; SVE-FIXED-NEXT:    index z2.d, #0, #1
+; SVE-FIXED-NEXT:    umov w12, v1.b[2]
+; SVE-FIXED-NEXT:    umov w10, v1.b[10]
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    umov w13, v1.b[12]
+; SVE-FIXED-NEXT:    umov w11, v1.b[15]
+; SVE-FIXED-NEXT:    umov w14, v1.b[4]
+; SVE-FIXED-NEXT:    umov w16, v1.b[0]
+; SVE-FIXED-NEXT:    umov w15, v1.b[8]
+; SVE-FIXED-NEXT:    fmov s3, w8
+; SVE-FIXED-NEXT:    umov w8, v1.b[7]
+; SVE-FIXED-NEXT:    fmov s4, w9
+; SVE-FIXED-NEXT:    umov w9, v1.b[11]
+; SVE-FIXED-NEXT:    fmov s6, w12
+; SVE-FIXED-NEXT:    umov w12, v1.b[3]
+; SVE-FIXED-NEXT:    fmov s5, w10
+; SVE-FIXED-NEXT:    umov w10, v1.b[1]
+; SVE-FIXED-NEXT:    fmov s7, w13
+; SVE-FIXED-NEXT:    umov w13, v1.b[13]
+; SVE-FIXED-NEXT:    fmov s16, w14
+; SVE-FIXED-NEXT:    fmov s18, w16
+; SVE-FIXED-NEXT:    mov v4.s[1], w8
+; SVE-FIXED-NEXT:    umov w8, v1.b[5]
+; SVE-FIXED-NEXT:    mov v3.s[1], w11
+; SVE-FIXED-NEXT:    mov v5.s[1], w9
+; SVE-FIXED-NEXT:    mov v6.s[1], w12
+; SVE-FIXED-NEXT:    umov w9, v1.b[9]
+; SVE-FIXED-NEXT:    fmov s17, w15
+; SVE-FIXED-NEXT:    mov v18.s[1], w10
+; SVE-FIXED-NEXT:    mov z19.d, z2.d
+; SVE-FIXED-NEXT:    mov v7.s[1], w13
+; SVE-FIXED-NEXT:    mov z20.d, z2.d
+; SVE-FIXED-NEXT:    mov z21.d, z2.d
+; SVE-FIXED-NEXT:    mov v16.s[1], w8
+; SVE-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT:    ushll v6.2d, v6.2s, #0
+; SVE-FIXED-NEXT:    mov v17.s[1], w9
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    ushll v18.2d, v18.2s, #0
+; SVE-FIXED-NEXT:    mov z25.d, z2.d
+; SVE-FIXED-NEXT:    ushll v7.2d, v7.2s, #0
+; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT:    ushll v16.2d, v16.2s, #0
+; SVE-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT:    shl v6.2d, v6.2d, #63
+; SVE-FIXED-NEXT:    mov z22.d, z2.d
+; SVE-FIXED-NEXT:    mov z23.d, z2.d
+; SVE-FIXED-NEXT:    add z19.d, z19.d, #6 // =0x6
+; SVE-FIXED-NEXT:    shl v18.2d, v18.2d, #63
+; SVE-FIXED-NEXT:    ushll v17.2d, v17.2s, #0
+; SVE-FIXED-NEXT:    shl v7.2d, v7.2d, #63
+; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT:    add z25.d, z25.d, #14 // =0xe
+; SVE-FIXED-NEXT:    shl v16.2d, v16.2d, #63
+; SVE-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT:    add z20.d, z20.d, #10 // =0xa
+; SVE-FIXED-NEXT:    cmlt v6.2d, v6.2d, #0
+; SVE-FIXED-NEXT:    add z21.d, z21.d, #2 // =0x2
+; SVE-FIXED-NEXT:    mov z24.d, z2.d
+; SVE-FIXED-NEXT:    shl v17.2d, v17.2d, #63
+; SVE-FIXED-NEXT:    cmlt v18.2d, v18.2d, #0
+; SVE-FIXED-NEXT:    cmlt v7.2d, v7.2d, #0
+; SVE-FIXED-NEXT:    add z22.d, z22.d, #12 // =0xc
+; SVE-FIXED-NEXT:    cmlt v16.2d, v16.2d, #0
+; SVE-FIXED-NEXT:    add z23.d, z23.d, #4 // =0x4
+; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v25.16b
+; SVE-FIXED-NEXT:    and v4.16b, v4.16b, v19.16b
+; SVE-FIXED-NEXT:    and v5.16b, v5.16b, v20.16b
+; SVE-FIXED-NEXT:    and v6.16b, v6.16b, v21.16b
+; SVE-FIXED-NEXT:    cmlt v17.2d, v17.2d, #0
+; SVE-FIXED-NEXT:    add z24.d, z24.d, #8 // =0x8
+; SVE-FIXED-NEXT:    and v2.16b, v18.16b, v2.16b
+; SVE-FIXED-NEXT:    and v7.16b, v7.16b, v22.16b
+; SVE-FIXED-NEXT:    and v16.16b, v16.16b, v23.16b
+; SVE-FIXED-NEXT:    cmhi v18.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT:    shl v1.16b, v1.16b, #7
+; SVE-FIXED-NEXT:    cmhi v19.2d, v6.2d, v5.2d
+; SVE-FIXED-NEXT:    and v17.16b, v17.16b, v24.16b
+; SVE-FIXED-NEXT:    cmhi v20.2d, v16.2d, v7.2d
+; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v18.16b
+; SVE-FIXED-NEXT:    cmlt v1.16b, v1.16b, #0
+; SVE-FIXED-NEXT:    mov v4.16b, v19.16b
+; SVE-FIXED-NEXT:    cmhi v21.2d, v2.2d, v17.2d
+; SVE-FIXED-NEXT:    umaxv b1, v1.16b
+; SVE-FIXED-NEXT:    bsl v4.16b, v6.16b, v5.16b
+; SVE-FIXED-NEXT:    mov v5.16b, v20.16b
+; SVE-FIXED-NEXT:    bif v2.16b, v17.16b, v21.16b
+; SVE-FIXED-NEXT:    bsl v5.16b, v16.16b, v7.16b
+; SVE-FIXED-NEXT:    cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT:    cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT:    bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT:    cmhi d4, d2, d3
+; SVE-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT:    fmov x8, d2
+; SVE-FIXED-NEXT:    bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT:    ldrb w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i16:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NEON-FIXED-NEXT:    umov w8, v1.b[6]
+; NEON-FIXED-NEXT:    umov w9, v1.b[2]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    umov w11, v1.b[4]
+; NEON-FIXED-NEXT:    umov w12, v1.b[0]
+; NEON-FIXED-NEXT:    umov w10, v1.b[7]
+; NEON-FIXED-NEXT:    umov w13, v1.b[3]
+; NEON-FIXED-NEXT:    umov w14, v1.b[5]
+; NEON-FIXED-NEXT:    umov w15, v1.b[1]
+; NEON-FIXED-NEXT:    shl v1.8b, v1.8b, #7
+; NEON-FIXED-NEXT:    fmov s2, w8
+; NEON-FIXED-NEXT:    adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT:    fmov s3, w9
+; NEON-FIXED-NEXT:    fmov s4, w11
+; NEON-FIXED-NEXT:    adrp x9, .LCPI1_1
+; NEON-FIXED-NEXT:    ldr q6, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT:    fmov s5, w12
+; NEON-FIXED-NEXT:    adrp x8, .LCPI1_3
+; NEON-FIXED-NEXT:    ldr q7, [x9, :lo12:.LCPI1_1]
+; NEON-FIXED-NEXT:    mov v2.s[1], w10
+; NEON-FIXED-NEXT:    mov v3.s[1], w13
+; NEON-FIXED-NEXT:    adrp x10, .LCPI1_2
+; NEON-FIXED-NEXT:    mov v4.s[1], w14
+; NEON-FIXED-NEXT:    ldr q16, [x10, :lo12:.LCPI1_2]
+; NEON-FIXED-NEXT:    ldr q17, [x8, :lo12:.LCPI1_3]
+; NEON-FIXED-NEXT:    mov v5.s[1], w15
+; NEON-FIXED-NEXT:    cmlt v1.8b, v1.8b, #0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT:    umaxv b1, v1.8b
+; NEON-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT:    shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT:    and v2.16b, v2.16b, v6.16b
+; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v7.16b
+; NEON-FIXED-NEXT:    and v4.16b, v4.16b, v16.16b
+; NEON-FIXED-NEXT:    and v5.16b, v5.16b, v17.16b
+; NEON-FIXED-NEXT:    cmhi v6.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT:    cmhi v7.2d, v5.2d, v4.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v6.16b
+; NEON-FIXED-NEXT:    mov v3.16b, v7.16b
+; NEON-FIXED-NEXT:    bsl v3.16b, v5.16b, v4.16b
+; NEON-FIXED-NEXT:    cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT:    cmhi d4, d2, d3
+; NEON-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT:    fmov x8, d2
+; NEON-FIXED-NEXT:    bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT:    ldrh w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i16:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SVE-FIXED-NEXT:    umov w8, v1.b[0]
+; SVE-FIXED-NEXT:    umov w10, v1.b[6]
+; SVE-FIXED-NEXT:    index z6.d, #0, #1
+; SVE-FIXED-NEXT:    umov w11, v1.b[2]
+; SVE-FIXED-NEXT:    umov w14, v1.b[4]
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    umov w9, v1.b[1]
+; SVE-FIXED-NEXT:    umov w12, v1.b[7]
+; SVE-FIXED-NEXT:    umov w13, v1.b[3]
+; SVE-FIXED-NEXT:    fmov s2, w8
+; SVE-FIXED-NEXT:    umov w8, v1.b[5]
+; SVE-FIXED-NEXT:    fmov s3, w10
+; SVE-FIXED-NEXT:    fmov s4, w11
+; SVE-FIXED-NEXT:    fmov s5, w14
+; SVE-FIXED-NEXT:    mov z7.d, z6.d
+; SVE-FIXED-NEXT:    mov z16.d, z6.d
+; SVE-FIXED-NEXT:    mov z17.d, z6.d
+; SVE-FIXED-NEXT:    shl v1.8b, v1.8b, #7
+; SVE-FIXED-NEXT:    mov v2.s[1], w9
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    mov v3.s[1], w12
+; SVE-FIXED-NEXT:    mov v4.s[1], w13
+; SVE-FIXED-NEXT:    mov v5.s[1], w8
+; SVE-FIXED-NEXT:    add z7.d, z7.d, #2 // =0x2
+; SVE-FIXED-NEXT:    add z17.d, z17.d, #6 // =0x6
+; SVE-FIXED-NEXT:    add z16.d, z16.d, #4 // =0x4
+; SVE-FIXED-NEXT:    cmlt v1.8b, v1.8b, #0
+; SVE-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT:    umaxv b1, v1.8b
+; SVE-FIXED-NEXT:    shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT:    and v2.16b, v2.16b, v6.16b
+; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v17.16b
+; SVE-FIXED-NEXT:    and v4.16b, v4.16b, v7.16b
+; SVE-FIXED-NEXT:    and v5.16b, v5.16b, v16.16b
+; SVE-FIXED-NEXT:    cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT:    cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT:    bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT:    cmhi d4, d2, d3
+; SVE-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT:    fmov x8, d2
+; SVE-FIXED-NEXT:    bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT:    ldrh w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %res = call i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru)
+  ret i16 %res
+}
+
+define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i32:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    ushll v2.4s, v1.4h, #0
+; NEON-FIXED-NEXT:    adrp x8, .LCPI2_0
+; NEON-FIXED-NEXT:    adrp x9, .LCPI2_1
+; NEON-FIXED-NEXT:    ldr q4, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT:    ldr q5, [x9, :lo12:.LCPI2_1]
+; NEON-FIXED-NEXT:    shl v1.4h, v1.4h, #15
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    ushll2 v3.2d, v2.4s, #0
+; NEON-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT:    cmlt v1.4h, v1.4h, #0
+; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT:    shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    and v2.16b, v2.16b, v5.16b
+; NEON-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
+; NEON-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    bic v3.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT:    cmhi d4, d3, d2
+; NEON-FIXED-NEXT:    bit v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT:    fmov x8, d2
+; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT:    ldr w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i32:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    ushll v2.4s, v1.4h, #0
+; SVE-FIXED-NEXT:    index z4.d, #0, #1
+; SVE-FIXED-NEXT:    shl v1.4h, v1.4h, #15
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    ushll2 v3.2d, v2.4s, #0
+; SVE-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT:    cmlt v1.4h, v1.4h, #0
+; SVE-FIXED-NEXT:    mov z5.d, z4.d
+; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT:    shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    add z5.d, z5.d, #2 // =0x2
+; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT:    and v2.16b, v2.16b, v4.16b
+; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v5.16b
+; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT:    bic v3.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT:    cmhi d4, d3, d2
+; SVE-FIXED-NEXT:    bit v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT:    fmov x8, d2
+; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT:    ldr w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+  ret i32 %res
+}
+
+define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) {
+; CHECK-LABEL: extract_last_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov d2, xzr
+; CHECK-NEXT:    fmov d4, x8
+; CHECK-NEXT:    shl v1.2s, v1.2s, #31
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    shl v3.2d, v3.2d, #63
+; CHECK-NEXT:    cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; CHECK-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    and v3.8b, v3.8b, v4.8b
+; CHECK-NEXT:    cmhi d2, d2, d3
+; CHECK-NEXT:    bic v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    orr x8, x9, x8, lsl #3
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    tst w9, #0x1
+; CHECK-NEXT:    csel x0, x8, x0, ne
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru)
+  ret i64 %res
+}
+
+define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    mov z3.d, #0 // =0x0
+; CHECK-NEXT:    punpkhi p4.h, p0.b
+; CHECK-NEXT:    punpklo p5.h, p2.b
+; CHECK-NEXT:    punpkhi p1.h, p4.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z5.d, z1.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    punpkhi p3.h, p2.b
+; CHECK-NEXT:    punpklo p2.h, p4.b
+; CHECK-NEXT:    incd z2.d
+; CHECK-NEXT:    incd z5.d, all, mul #2
+; CHECK-NEXT:    punpklo p4.h, p5.b
+; CHECK-NEXT:    incd z6.d, all, mul #4
+; CHECK-NEXT:    punpkhi p6.h, p1.b
+; CHECK-NEXT:    punpkhi p7.h, p3.b
+; CHECK-NEXT:    sel z1.d, p4, z1.d, z3.d
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z25.d, z5.d
+; CHECK-NEXT:    punpkhi p5.h, p5.b
+; CHECK-NEXT:    punpkhi p4.h, p2.b
+; CHECK-NEXT:    incd z4.d, all, mul #2
+; CHECK-NEXT:    incd z25.d, all, mul #4
+; CHECK-NEXT:    incd z7.d, all, mul #4
+; CHECK-NEXT:    punpklo p3.h, p3.b
+; CHECK-NEXT:    sel z2.d, p5, z2.d, z3.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    punpklo p2.h, p2.b
+; CHECK-NEXT:    mov z24.d, z4.d
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    sel z5.d, p3, z5.d, z3.d
+; CHECK-NEXT:    sel z4.d, p7, z4.d, z3.d
+; CHECK-NEXT:    sel z6.d, p2, z6.d, z3.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z25.d, p1, z25.d, z3.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    incd z24.d, all, mul #4
+; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z6.d
+; CHECK-NEXT:    sel z24.d, p6, z24.d, z3.d
+; CHECK-NEXT:    mov z3.d, p4/m, z7.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    umax z4.d, p1/m, z4.d, z24.d
+; CHECK-NEXT:    umax z2.d, p1/m, z2.d, z3.d
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    umax z3.d, p1/m, z3.d, z25.d
+; CHECK-NEXT:    umax z2.d, p1/m, z2.d, z4.d
+; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z3.d
+; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT:    umaxv d1, p1, z1.d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    ptest p0, p0.b
+; CHECK-NEXT:    lastb w8, p1, z0.b
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    mov z5.d, #0 // =0x0
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    punpkhi p3.h, p1.b
+; CHECK-NEXT:    punpkhi p4.h, p2.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p2.h, p2.b
+; CHECK-NEXT:    incd z2.d
+; CHECK-NEXT:    incd z3.d, all, mul #2
+; CHECK-NEXT:    sel z1.d, p2, z1.d, z5.d
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    sel z2.d, p4, z2.d, z5.d
+; CHECK-NEXT:    sel z3.d, p1, z3.d, z5.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    incd z4.d, all, mul #2
+; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z3.d
+; CHECK-NEXT:    sel z4.d, p3, z4.d, z5.d
+; CHECK-NEXT:    umax z2.d, p1/m, z2.d, z4.d
+; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT:    umaxv d1, p1, z1.d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.h
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+  ret i16 %res
+}
+
+define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z3.d, #0 // =0x0
+; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    sel z1.d, p2, z1.d, z3.d
+; CHECK-NEXT:    incd z2.d
+; CHECK-NEXT:    sel z2.d, p1, z2.d, z3.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT:    umaxv d1, p1, z1.d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.s
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
+  ret i32 %res
+}
+
+define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
+; CHECK-LABEL: extract_last_i64_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT:    umaxv d1, p1, z1.d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    whilels p2.d, xzr, x8
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    lastb x8, p2, z0.d
+; CHECK-NEXT:    csel x0, x8, x0, ne
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
+  ret i64 %res
+}
+
+declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
+declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
+declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
+declare i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
+declare i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }

>From 641715c419105dd9c18798a19509aa9a84a6856c Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 7 Nov 2024 17:20:20 +0000
Subject: [PATCH 2/3] Address review comments

---
 llvm/docs/LangRef.rst                         |  19 +-
 llvm/include/llvm/IR/Intrinsics.td            |   2 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  32 +-
 llvm/lib/IR/AutoUpgrade.cpp                   |   3 +
 .../AArch64/vector-extract-last-active.ll     | 420 +++++++++++
 .../CodeGen/AArch64/vector-masked-extract.ll  | 663 ------------------
 6 files changed, 456 insertions(+), 683 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/vector-masked-extract.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ef2965a1a19610..7f7eaa240c2956 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20002,18 +20002,15 @@ the following sequence of operations:
 
 The ``mask`` operand will apply to at least the gather and scatter operations.
 
-'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.extract.last.active``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This is an overloaded intrinsic.
 
-This intrinsic will extract the value from a single lane of a vector, based
-on a supplied mask vector.
-
 ::
 
-    declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
-    declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+    declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+    declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
 
 Arguments:
 """"""""""
@@ -20028,10 +20025,10 @@ the passthru value must match that of the elements of the data vector.
 Semantics:
 """"""""""
 
-The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will
-find the index of the most significant active lane in the mask vector, and
-extract the element at that index in the corresponding data vector. If no mask
-lanes are active then the passthru value is returned instead.
+The '``llvm.experimental.vector.extract.last.active``' intrinsic will extract an
+element from the data vector at the index matching the highest active lane of
+the mask vector. If no mask lanes are active then the passthru value is
+returned instead.
 
 .. _int_vector_compress:
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index bd2edd9f950369..02caa076d12c8f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1921,7 +1921,7 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                              [ IntrArgMemOnly ]>;
 
 // Extract based on mask bits
-def int_experimental_vector_masked_extract_last_active:
+def int_experimental_vector_extract_last_active:
     DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
               [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                LLVMVectorElementType<0>], [IntrNoMem]>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ea0b0330e981b1..06755926841ac3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8207,7 +8207,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     visitVectorHistogram(I, Intrinsic);
     return;
   }
-  case Intrinsic::experimental_vector_masked_extract_last_active: {
+  case Intrinsic::experimental_vector_extract_last_active: {
     SDValue Data = getValue(I.getOperand(0));
     SDValue Mask = getValue(I.getOperand(1));
     SDValue PassThru = getValue(I.getOperand(2));
@@ -8215,16 +8215,32 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     EVT DataVT = Data.getValueType();
     EVT ScalarVT = PassThru.getValueType();
     EVT BoolVT = Mask.getValueType().getScalarType();
-    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-    EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT);
 
-    SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT);
-    SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT);
-    SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes);
+    // Find a suitable type for a stepvector.
+    ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+    if (DataVT.isScalableVector())
+      VScaleRange = getVScaleRange(I.getCaller(), 64);
+    unsigned EltWidth = TLI.getBitWidthForCttzElements(
+        I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
+        &VScaleRange);
+    MVT StepVT = MVT::getIntegerVT(EltWidth);
+    EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+    // Zero out lanes with inactive elements, then find the highest remaining
+    // value from the stepvector.
+    SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
+    SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
+    SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
     SDValue HighestIdx =
-        DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts);
+        DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
+
+    // Extract the corresponding lane from the data vector.
+    EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
     SDValue Extract =
-        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx);
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
+
+    // If all mask lanes were inactive, choose the passthru value instead.
     SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
     SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
     setValue(&I, Result);
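
The key change here is that TLI.getBitWidthForCttzElements now picks the
narrowest integer type able to hold every lane index, consulting the
caller's vscale_range for scalable vectors, and the reduced index is
zero-extended afterwards for the extract. For <vscale x 16 x i8> with
vscale_range(1, 16) the largest possible lane index is 16 * 16 - 1 = 255,
so an i8 step vector suffices. A rough IR-level sketch of the new sequence
(again using the stepvector intrinsic as a stand-in for the STEP_VECTOR
node)::

    %steps  = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
    %active = select <vscale x 16 x i1> %mask, <vscale x 16 x i8> %steps, <vscale x 16 x i8> zeroinitializer
    %idx8   = call i8 @llvm.vector.reduce.umax.nxv16i8(<vscale x 16 x i8> %active)
    ; Zero-extend for the extract; visible as 'and x8, x8, #0xff' in the
    ; updated tests below.
    %idx    = zext i8 %idx8 to i64
    %elt    = extractelement <vscale x 16 x i8> %data, i64 %idx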
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d0e0da53307cf8..e73538da282e99 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1119,6 +1119,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
     if (Name.consume_front("experimental.vector.")) {
       Intrinsic::ID ID =
           StringSwitch<Intrinsic::ID>(Name)
+              // Skip over extract.last.active, otherwise it will be 'upgraded'
+              // to a regular vector extract, which is a different operation.
+              .StartsWith("extract.last.active.", Intrinsic::not_intrinsic)
               .StartsWith("extract.", Intrinsic::vector_extract)
               .StartsWith("insert.", Intrinsic::vector_insert)
               .StartsWith("splice.", Intrinsic::vector_splice)
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
new file mode 100644
index 00000000000000..c0f1720e1cf8b3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -0,0 +1,420 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
+; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i8:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    cmeq v2.16b, v1.16b, #0
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_0
+; NEON-FIXED-NEXT:    cmtst v1.16b, v1.16b, v1.16b
+; NEON-FIXED-NEXT:    ldr q3, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    bic v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT:    umaxv b1, v1.16b
+; NEON-FIXED-NEXT:    umaxv b2, v2.16b
+; NEON-FIXED-NEXT:    fmov w8, s2
+; NEON-FIXED-NEXT:    bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT:    ldrb w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i8:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    index z2.b, #0, #1
+; SVE-FIXED-NEXT:    cmeq v3.16b, v1.16b, #0
+; SVE-FIXED-NEXT:    cmtst v1.16b, v1.16b, v1.16b
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    bic v2.16b, v2.16b, v3.16b
+; SVE-FIXED-NEXT:    umaxv b1, v1.16b
+; SVE-FIXED-NEXT:    umaxv b2, v2.16b
+; SVE-FIXED-NEXT:    fmov w8, s2
+; SVE-FIXED-NEXT:    bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT:    ldrb w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %notzero = icmp ne <16 x i8> %mask, zeroinitializer
+  %res = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %notzero, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i16:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    cmtst v1.8h, v1.8h, v1.8h
+; NEON-FIXED-NEXT:    adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr d2, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    xtn v1.8b, v1.8h
+; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT:    umaxv b1, v1.8b
+; NEON-FIXED-NEXT:    umaxv b2, v2.8b
+; NEON-FIXED-NEXT:    fmov w8, s2
+; NEON-FIXED-NEXT:    bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT:    ldrh w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i16:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    cmtst v1.8h, v1.8h, v1.8h
+; SVE-FIXED-NEXT:    index z2.b, #0, #1
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    xtn v1.8b, v1.8h
+; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT:    umaxv b1, v1.8b
+; SVE-FIXED-NEXT:    umaxv b2, v2.8b
+; SVE-FIXED-NEXT:    fmov w8, s2
+; SVE-FIXED-NEXT:    bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT:    ldrh w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %notzero = icmp ne <8 x i16> %mask, zeroinitializer
+  %res = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %notzero, i16 %passthru)
+  ret i16 %res
+}
+
+define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i32:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
+; NEON-FIXED-NEXT:    adrp x8, .LCPI2_0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr d2, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    xtn v1.4h, v1.4s
+; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    umaxv h2, v2.4h
+; NEON-FIXED-NEXT:    fmov w8, s2
+; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT:    ldr w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i32:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
+; SVE-FIXED-NEXT:    index z2.h, #0, #1
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    xtn v1.4h, v1.4s
+; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    umaxv h2, v2.4h
+; SVE-FIXED-NEXT:    fmov w8, s2
+; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT:    ldr w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %notzero = icmp ne <4 x i32> %mask, zeroinitializer
+  %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %notzero, i32 %passthru)
+  ret i32 %res
+}
+
+define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i64:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
+; NEON-FIXED-NEXT:    adrp x8, .LCPI3_0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr d2, [x8, :lo12:.LCPI3_0]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    xtn v1.2s, v1.2d
+; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT:    umaxp v2.2s, v2.2s, v2.2s
+; NEON-FIXED-NEXT:    fmov w8, s2
+; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT:    ldr x8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel x0, x8, x0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i64:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
+; SVE-FIXED-NEXT:    index z2.s, #0, #1
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
+; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT:    umaxp v2.2s, v2.2s, v2.2s
+; SVE-FIXED-NEXT:    fmov w8, s2
+; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT:    ldr x8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel x0, x8, x0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %notzero = icmp ne <2 x i64> %mask, zeroinitializer
+  %res = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %notzero, i64 %passthru)
+  ret i64 %res
+}
+
+define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %passthru) {
+; NEON-FIXED-LABEL: extract_last_float:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
+; NEON-FIXED-NEXT:    adrp x8, .LCPI4_0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI4_0]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    xtn v1.4h, v1.4s
+; NEON-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
+; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    umaxv h3, v3.4h
+; NEON-FIXED-NEXT:    fmov w8, s3
+; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT:    fmov w8, s1
+; NEON-FIXED-NEXT:    ldr s0, [x9]
+; NEON-FIXED-NEXT:    tst w8, #0x1
+; NEON-FIXED-NEXT:    fcsel s0, s0, s2, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_float:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
+; SVE-FIXED-NEXT:    index z3.h, #0, #1
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    xtn v1.4h, v1.4s
+; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
+; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    umaxv h3, v3.4h
+; SVE-FIXED-NEXT:    fmov w8, s3
+; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT:    fmov w8, s1
+; SVE-FIXED-NEXT:    ldr s0, [x9]
+; SVE-FIXED-NEXT:    tst w8, #0x1
+; SVE-FIXED-NEXT:    fcsel s0, s0, s2, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %notzero = icmp ne <4 x i32> %mask, zeroinitializer
+  %res = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %notzero, float %passthru)
+  ret float %res
+}
+
+define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %passthru) {
+; NEON-FIXED-LABEL: extract_last_double:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
+; NEON-FIXED-NEXT:    adrp x8, .LCPI5_0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI5_0]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    xtn v1.2s, v1.2d
+; NEON-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
+; NEON-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
+; NEON-FIXED-NEXT:    fmov w8, s3
+; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT:    fmov w8, s1
+; NEON-FIXED-NEXT:    ldr d0, [x9]
+; NEON-FIXED-NEXT:    tst w8, #0x1
+; NEON-FIXED-NEXT:    fcsel d0, d0, d2, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_double:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
+; SVE-FIXED-NEXT:    index z3.s, #0, #1
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
+; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
+; SVE-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
+; SVE-FIXED-NEXT:    fmov w8, s3
+; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT:    fmov w8, s1
+; SVE-FIXED-NEXT:    ldr d0, [x9]
+; SVE-FIXED-NEXT:    tst w8, #0x1
+; SVE-FIXED-NEXT:    fcsel d0, d0, d2, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %notzero = icmp ne <2 x i64> %mask, zeroinitializer
+  %res = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> %data, <2 x i1> %notzero, double %passthru)
+  ret double %res
+}
+
+define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    sel z1.b, p0, z1.b, z2.b
+; CHECK-NEXT:    umaxv b1, p1, z1.b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    ptest p0, p0.b
+; CHECK-NEXT:    lastb w8, p1, z0.b
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.h, #0, #1
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    sel z1.h, p0, z1.h, z2.h
+; CHECK-NEXT:    umaxv h1, p1, z1.h
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p2.h, xzr, x8
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    lastb w8, p2, z0.h
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+  ret i16 %res
+}
+
+define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    sel z1.s, p0, z1.s, z2.s
+; CHECK-NEXT:    umaxv s1, p1, z1.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p2.s, xzr, x8
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    lastb w8, p2, z0.s
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
+  ret i32 %res
+}
+
+define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
+; CHECK-LABEL: extract_last_i64_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT:    umaxv d1, p1, z1.d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p2.d, xzr, x8
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    lastb x8, p2, z0.d
+; CHECK-NEXT:    csel x0, x8, x0, ne
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
+  ret i64 %res
+}
+
+define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) #0 {
+; CHECK-LABEL: extract_last_float_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z2.s, #0, #1
+; CHECK-NEXT:    mov z3.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    sel z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    umaxv s2, p1, z2.s
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p2.s, xzr, x8
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    lastb s0, p2, z0.s
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    ret
+  %res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru)
+  ret float %res
+}
+
+define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) #0 {
+; CHECK-LABEL: extract_last_double_scalable:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z2.d, #0, #1
+; CHECK-NEXT:    mov z3.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sel z2.d, p0, z2.d, z3.d
+; CHECK-NEXT:    umaxv d2, p1, z2.d
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p2.d, xzr, x8
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    lastb d0, p2, z0.d
+; CHECK-NEXT:    fcsel d0, d0, d1, ne
+; CHECK-NEXT:    ret
+  %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
+  ret double %res
+}
+
+declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
+declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
+declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
+declare i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
+declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float>, <4 x i1>, float)
+declare double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double>, <2 x i1>, double)
+declare i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
+declare double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
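
All of the scalable cases above share one codegen shape: an 'index' step
vector, 'sel' to zero out inactive lanes, 'umaxv' to find the last active
lane, 'whilels' plus 'lastb' to extract it, and 'ptest' feeding
'csel'/'fcsel' to fall back to the passthru. For reference, a minimal
hand-written use of the intrinsic (separate from the patch itself; the @demo
function and the constant mask are assumed purely for illustration) looks
like:

    declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)

    ; Mask lanes are {1,0,1,0}: the highest active lane is 2, so this returns
    ; element 2 of %data. With an all-false mask it would return %passthru.
    define i32 @demo(<4 x i32> %data, i32 %passthru) {
      %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(
                 <4 x i32> %data,
                 <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
                 i32 %passthru)
      ret i32 %res
    }
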
diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
deleted file mode 100644
index 04adf4e476b041..00000000000000
--- a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
+++ /dev/null
@@ -1,663 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
-; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
-
-define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i8:
-; NEON-FIXED:       // %bb.0:
-; NEON-FIXED-NEXT:    sub sp, sp, #16
-; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT:    umov w15, v1.b[14]
-; NEON-FIXED-NEXT:    umov w14, v1.b[6]
-; NEON-FIXED-NEXT:    adrp x8, .LCPI0_0
-; NEON-FIXED-NEXT:    umov w12, v1.b[15]
-; NEON-FIXED-NEXT:    umov w13, v1.b[10]
-; NEON-FIXED-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
-; NEON-FIXED-NEXT:    umov w11, v1.b[2]
-; NEON-FIXED-NEXT:    umov w8, v1.b[7]
-; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    umov w9, v1.b[11]
-; NEON-FIXED-NEXT:    umov w10, v1.b[3]
-; NEON-FIXED-NEXT:    umov w16, v1.b[12]
-; NEON-FIXED-NEXT:    fmov s3, w15
-; NEON-FIXED-NEXT:    umov w15, v1.b[4]
-; NEON-FIXED-NEXT:    fmov s4, w14
-; NEON-FIXED-NEXT:    fmov s5, w13
-; NEON-FIXED-NEXT:    umov w13, v1.b[0]
-; NEON-FIXED-NEXT:    umov w14, v1.b[13]
-; NEON-FIXED-NEXT:    fmov s6, w11
-; NEON-FIXED-NEXT:    umov w11, v1.b[5]
-; NEON-FIXED-NEXT:    mov v3.s[1], w12
-; NEON-FIXED-NEXT:    umov w12, v1.b[8]
-; NEON-FIXED-NEXT:    mov v4.s[1], w8
-; NEON-FIXED-NEXT:    umov w8, v1.b[9]
-; NEON-FIXED-NEXT:    mov v5.s[1], w9
-; NEON-FIXED-NEXT:    umov w9, v1.b[1]
-; NEON-FIXED-NEXT:    fmov s7, w16
-; NEON-FIXED-NEXT:    fmov s16, w15
-; NEON-FIXED-NEXT:    mov v6.s[1], w10
-; NEON-FIXED-NEXT:    fmov s18, w13
-; NEON-FIXED-NEXT:    shl v1.16b, v1.16b, #7
-; NEON-FIXED-NEXT:    fmov s17, w12
-; NEON-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
-; NEON-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
-; NEON-FIXED-NEXT:    mov v7.s[1], w14
-; NEON-FIXED-NEXT:    mov v16.s[1], w11
-; NEON-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
-; NEON-FIXED-NEXT:    mov v18.s[1], w9
-; NEON-FIXED-NEXT:    adrp x9, .LCPI0_2
-; NEON-FIXED-NEXT:    ushll v6.2d, v6.2s, #0
-; NEON-FIXED-NEXT:    ldr q20, [x9, :lo12:.LCPI0_2]
-; NEON-FIXED-NEXT:    adrp x9, .LCPI0_7
-; NEON-FIXED-NEXT:    mov v17.s[1], w8
-; NEON-FIXED-NEXT:    adrp x8, .LCPI0_1
-; NEON-FIXED-NEXT:    ldr q23, [x9, :lo12:.LCPI0_7]
-; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ldr q19, [x8, :lo12:.LCPI0_1]
-; NEON-FIXED-NEXT:    adrp x8, .LCPI0_3
-; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT:    shl v4.2d, v4.2d, #63
-; NEON-FIXED-NEXT:    ushll v7.2d, v7.2s, #0
-; NEON-FIXED-NEXT:    shl v5.2d, v5.2d, #63
-; NEON-FIXED-NEXT:    ushll v16.2d, v16.2s, #0
-; NEON-FIXED-NEXT:    ushll v17.2d, v17.2s, #0
-; NEON-FIXED-NEXT:    shl v6.2d, v6.2d, #63
-; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT:    ushll v18.2d, v18.2s, #0
-; NEON-FIXED-NEXT:    cmlt v1.16b, v1.16b, #0
-; NEON-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
-; NEON-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
-; NEON-FIXED-NEXT:    cmlt v6.2d, v6.2d, #0
-; NEON-FIXED-NEXT:    and v2.16b, v3.16b, v2.16b
-; NEON-FIXED-NEXT:    shl v3.2d, v7.2d, #63
-; NEON-FIXED-NEXT:    shl v7.2d, v16.2d, #63
-; NEON-FIXED-NEXT:    shl v16.2d, v17.2d, #63
-; NEON-FIXED-NEXT:    ldr q17, [x8, :lo12:.LCPI0_3]
-; NEON-FIXED-NEXT:    adrp x8, .LCPI0_4
-; NEON-FIXED-NEXT:    ldr q21, [x8, :lo12:.LCPI0_4]
-; NEON-FIXED-NEXT:    adrp x8, .LCPI0_5
-; NEON-FIXED-NEXT:    shl v18.2d, v18.2d, #63
-; NEON-FIXED-NEXT:    ldr q22, [x8, :lo12:.LCPI0_5]
-; NEON-FIXED-NEXT:    adrp x8, .LCPI0_6
-; NEON-FIXED-NEXT:    and v4.16b, v4.16b, v19.16b
-; NEON-FIXED-NEXT:    ldr q19, [x8, :lo12:.LCPI0_6]
-; NEON-FIXED-NEXT:    cmlt v16.2d, v16.2d, #0
-; NEON-FIXED-NEXT:    and v5.16b, v5.16b, v20.16b
-; NEON-FIXED-NEXT:    cmlt v18.2d, v18.2d, #0
-; NEON-FIXED-NEXT:    and v6.16b, v6.16b, v17.16b
-; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT:    cmlt v7.2d, v7.2d, #0
-; NEON-FIXED-NEXT:    umaxv b1, v1.16b
-; NEON-FIXED-NEXT:    and v16.16b, v16.16b, v19.16b
-; NEON-FIXED-NEXT:    and v17.16b, v18.16b, v23.16b
-; NEON-FIXED-NEXT:    cmhi v18.2d, v4.2d, v2.2d
-; NEON-FIXED-NEXT:    cmhi v19.2d, v6.2d, v5.2d
-; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v21.16b
-; NEON-FIXED-NEXT:    and v7.16b, v7.16b, v22.16b
-; NEON-FIXED-NEXT:    cmhi v21.2d, v17.2d, v16.2d
-; NEON-FIXED-NEXT:    bit v2.16b, v4.16b, v18.16b
-; NEON-FIXED-NEXT:    mov v4.16b, v19.16b
-; NEON-FIXED-NEXT:    cmhi v20.2d, v7.2d, v3.2d
-; NEON-FIXED-NEXT:    bsl v4.16b, v6.16b, v5.16b
-; NEON-FIXED-NEXT:    mov v5.16b, v21.16b
-; NEON-FIXED-NEXT:    bit v3.16b, v7.16b, v20.16b
-; NEON-FIXED-NEXT:    bsl v5.16b, v17.16b, v16.16b
-; NEON-FIXED-NEXT:    cmhi v6.2d, v4.2d, v2.2d
-; NEON-FIXED-NEXT:    cmhi v7.2d, v5.2d, v3.2d
-; NEON-FIXED-NEXT:    bit v2.16b, v4.16b, v6.16b
-; NEON-FIXED-NEXT:    bit v3.16b, v5.16b, v7.16b
-; NEON-FIXED-NEXT:    cmhi v4.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT:    cmhi d4, d2, d3
-; NEON-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT:    fmov x8, d2
-; NEON-FIXED-NEXT:    bfxil x9, x8, #0, #4
-; NEON-FIXED-NEXT:    ldrb w8, [x9]
-; NEON-FIXED-NEXT:    fmov w9, s1
-; NEON-FIXED-NEXT:    tst w9, #0x1
-; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
-; NEON-FIXED-NEXT:    add sp, sp, #16
-; NEON-FIXED-NEXT:    ret
-;
-; SVE-FIXED-LABEL: extract_last_i8:
-; SVE-FIXED:       // %bb.0:
-; SVE-FIXED-NEXT:    sub sp, sp, #16
-; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT:    umov w8, v1.b[14]
-; SVE-FIXED-NEXT:    umov w9, v1.b[6]
-; SVE-FIXED-NEXT:    index z2.d, #0, #1
-; SVE-FIXED-NEXT:    umov w12, v1.b[2]
-; SVE-FIXED-NEXT:    umov w10, v1.b[10]
-; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    umov w13, v1.b[12]
-; SVE-FIXED-NEXT:    umov w11, v1.b[15]
-; SVE-FIXED-NEXT:    umov w14, v1.b[4]
-; SVE-FIXED-NEXT:    umov w16, v1.b[0]
-; SVE-FIXED-NEXT:    umov w15, v1.b[8]
-; SVE-FIXED-NEXT:    fmov s3, w8
-; SVE-FIXED-NEXT:    umov w8, v1.b[7]
-; SVE-FIXED-NEXT:    fmov s4, w9
-; SVE-FIXED-NEXT:    umov w9, v1.b[11]
-; SVE-FIXED-NEXT:    fmov s6, w12
-; SVE-FIXED-NEXT:    umov w12, v1.b[3]
-; SVE-FIXED-NEXT:    fmov s5, w10
-; SVE-FIXED-NEXT:    umov w10, v1.b[1]
-; SVE-FIXED-NEXT:    fmov s7, w13
-; SVE-FIXED-NEXT:    umov w13, v1.b[13]
-; SVE-FIXED-NEXT:    fmov s16, w14
-; SVE-FIXED-NEXT:    fmov s18, w16
-; SVE-FIXED-NEXT:    mov v4.s[1], w8
-; SVE-FIXED-NEXT:    umov w8, v1.b[5]
-; SVE-FIXED-NEXT:    mov v3.s[1], w11
-; SVE-FIXED-NEXT:    mov v5.s[1], w9
-; SVE-FIXED-NEXT:    mov v6.s[1], w12
-; SVE-FIXED-NEXT:    umov w9, v1.b[9]
-; SVE-FIXED-NEXT:    fmov s17, w15
-; SVE-FIXED-NEXT:    mov v18.s[1], w10
-; SVE-FIXED-NEXT:    mov z19.d, z2.d
-; SVE-FIXED-NEXT:    mov v7.s[1], w13
-; SVE-FIXED-NEXT:    mov z20.d, z2.d
-; SVE-FIXED-NEXT:    mov z21.d, z2.d
-; SVE-FIXED-NEXT:    mov v16.s[1], w8
-; SVE-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
-; SVE-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
-; SVE-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
-; SVE-FIXED-NEXT:    ushll v6.2d, v6.2s, #0
-; SVE-FIXED-NEXT:    mov v17.s[1], w9
-; SVE-FIXED-NEXT:    mov x9, sp
-; SVE-FIXED-NEXT:    ushll v18.2d, v18.2s, #0
-; SVE-FIXED-NEXT:    mov z25.d, z2.d
-; SVE-FIXED-NEXT:    ushll v7.2d, v7.2s, #0
-; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT:    shl v4.2d, v4.2d, #63
-; SVE-FIXED-NEXT:    ushll v16.2d, v16.2s, #0
-; SVE-FIXED-NEXT:    shl v5.2d, v5.2d, #63
-; SVE-FIXED-NEXT:    shl v6.2d, v6.2d, #63
-; SVE-FIXED-NEXT:    mov z22.d, z2.d
-; SVE-FIXED-NEXT:    mov z23.d, z2.d
-; SVE-FIXED-NEXT:    add z19.d, z19.d, #6 // =0x6
-; SVE-FIXED-NEXT:    shl v18.2d, v18.2d, #63
-; SVE-FIXED-NEXT:    ushll v17.2d, v17.2s, #0
-; SVE-FIXED-NEXT:    shl v7.2d, v7.2d, #63
-; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
-; SVE-FIXED-NEXT:    add z25.d, z25.d, #14 // =0xe
-; SVE-FIXED-NEXT:    shl v16.2d, v16.2d, #63
-; SVE-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
-; SVE-FIXED-NEXT:    add z20.d, z20.d, #10 // =0xa
-; SVE-FIXED-NEXT:    cmlt v6.2d, v6.2d, #0
-; SVE-FIXED-NEXT:    add z21.d, z21.d, #2 // =0x2
-; SVE-FIXED-NEXT:    mov z24.d, z2.d
-; SVE-FIXED-NEXT:    shl v17.2d, v17.2d, #63
-; SVE-FIXED-NEXT:    cmlt v18.2d, v18.2d, #0
-; SVE-FIXED-NEXT:    cmlt v7.2d, v7.2d, #0
-; SVE-FIXED-NEXT:    add z22.d, z22.d, #12 // =0xc
-; SVE-FIXED-NEXT:    cmlt v16.2d, v16.2d, #0
-; SVE-FIXED-NEXT:    add z23.d, z23.d, #4 // =0x4
-; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v25.16b
-; SVE-FIXED-NEXT:    and v4.16b, v4.16b, v19.16b
-; SVE-FIXED-NEXT:    and v5.16b, v5.16b, v20.16b
-; SVE-FIXED-NEXT:    and v6.16b, v6.16b, v21.16b
-; SVE-FIXED-NEXT:    cmlt v17.2d, v17.2d, #0
-; SVE-FIXED-NEXT:    add z24.d, z24.d, #8 // =0x8
-; SVE-FIXED-NEXT:    and v2.16b, v18.16b, v2.16b
-; SVE-FIXED-NEXT:    and v7.16b, v7.16b, v22.16b
-; SVE-FIXED-NEXT:    and v16.16b, v16.16b, v23.16b
-; SVE-FIXED-NEXT:    cmhi v18.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT:    shl v1.16b, v1.16b, #7
-; SVE-FIXED-NEXT:    cmhi v19.2d, v6.2d, v5.2d
-; SVE-FIXED-NEXT:    and v17.16b, v17.16b, v24.16b
-; SVE-FIXED-NEXT:    cmhi v20.2d, v16.2d, v7.2d
-; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v18.16b
-; SVE-FIXED-NEXT:    cmlt v1.16b, v1.16b, #0
-; SVE-FIXED-NEXT:    mov v4.16b, v19.16b
-; SVE-FIXED-NEXT:    cmhi v21.2d, v2.2d, v17.2d
-; SVE-FIXED-NEXT:    umaxv b1, v1.16b
-; SVE-FIXED-NEXT:    bsl v4.16b, v6.16b, v5.16b
-; SVE-FIXED-NEXT:    mov v5.16b, v20.16b
-; SVE-FIXED-NEXT:    bif v2.16b, v17.16b, v21.16b
-; SVE-FIXED-NEXT:    bsl v5.16b, v16.16b, v7.16b
-; SVE-FIXED-NEXT:    cmhi v6.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT:    cmhi v7.2d, v2.2d, v5.2d
-; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v6.16b
-; SVE-FIXED-NEXT:    bif v2.16b, v5.16b, v7.16b
-; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT:    cmhi d4, d2, d3
-; SVE-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT:    fmov x8, d2
-; SVE-FIXED-NEXT:    bfxil x9, x8, #0, #4
-; SVE-FIXED-NEXT:    ldrb w8, [x9]
-; SVE-FIXED-NEXT:    fmov w9, s1
-; SVE-FIXED-NEXT:    tst w9, #0x1
-; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
-; SVE-FIXED-NEXT:    add sp, sp, #16
-; SVE-FIXED-NEXT:    ret
-  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
-  ret i8 %res
-}
-
-define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i16:
-; NEON-FIXED:       // %bb.0:
-; NEON-FIXED-NEXT:    sub sp, sp, #16
-; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NEON-FIXED-NEXT:    umov w8, v1.b[6]
-; NEON-FIXED-NEXT:    umov w9, v1.b[2]
-; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    umov w11, v1.b[4]
-; NEON-FIXED-NEXT:    umov w12, v1.b[0]
-; NEON-FIXED-NEXT:    umov w10, v1.b[7]
-; NEON-FIXED-NEXT:    umov w13, v1.b[3]
-; NEON-FIXED-NEXT:    umov w14, v1.b[5]
-; NEON-FIXED-NEXT:    umov w15, v1.b[1]
-; NEON-FIXED-NEXT:    shl v1.8b, v1.8b, #7
-; NEON-FIXED-NEXT:    fmov s2, w8
-; NEON-FIXED-NEXT:    adrp x8, .LCPI1_0
-; NEON-FIXED-NEXT:    fmov s3, w9
-; NEON-FIXED-NEXT:    fmov s4, w11
-; NEON-FIXED-NEXT:    adrp x9, .LCPI1_1
-; NEON-FIXED-NEXT:    ldr q6, [x8, :lo12:.LCPI1_0]
-; NEON-FIXED-NEXT:    fmov s5, w12
-; NEON-FIXED-NEXT:    adrp x8, .LCPI1_3
-; NEON-FIXED-NEXT:    ldr q7, [x9, :lo12:.LCPI1_1]
-; NEON-FIXED-NEXT:    mov v2.s[1], w10
-; NEON-FIXED-NEXT:    mov v3.s[1], w13
-; NEON-FIXED-NEXT:    adrp x10, .LCPI1_2
-; NEON-FIXED-NEXT:    mov v4.s[1], w14
-; NEON-FIXED-NEXT:    ldr q16, [x10, :lo12:.LCPI1_2]
-; NEON-FIXED-NEXT:    ldr q17, [x8, :lo12:.LCPI1_3]
-; NEON-FIXED-NEXT:    mov v5.s[1], w15
-; NEON-FIXED-NEXT:    cmlt v1.8b, v1.8b, #0
-; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
-; NEON-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
-; NEON-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
-; NEON-FIXED-NEXT:    umaxv b1, v1.8b
-; NEON-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
-; NEON-FIXED-NEXT:    shl v2.2d, v2.2d, #63
-; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT:    shl v4.2d, v4.2d, #63
-; NEON-FIXED-NEXT:    shl v5.2d, v5.2d, #63
-; NEON-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
-; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
-; NEON-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
-; NEON-FIXED-NEXT:    and v2.16b, v2.16b, v6.16b
-; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v7.16b
-; NEON-FIXED-NEXT:    and v4.16b, v4.16b, v16.16b
-; NEON-FIXED-NEXT:    and v5.16b, v5.16b, v17.16b
-; NEON-FIXED-NEXT:    cmhi v6.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT:    cmhi v7.2d, v5.2d, v4.2d
-; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v6.16b
-; NEON-FIXED-NEXT:    mov v3.16b, v7.16b
-; NEON-FIXED-NEXT:    bsl v3.16b, v5.16b, v4.16b
-; NEON-FIXED-NEXT:    cmhi v4.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT:    cmhi d4, d2, d3
-; NEON-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT:    fmov x8, d2
-; NEON-FIXED-NEXT:    bfi x9, x8, #1, #3
-; NEON-FIXED-NEXT:    ldrh w8, [x9]
-; NEON-FIXED-NEXT:    fmov w9, s1
-; NEON-FIXED-NEXT:    tst w9, #0x1
-; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
-; NEON-FIXED-NEXT:    add sp, sp, #16
-; NEON-FIXED-NEXT:    ret
-;
-; SVE-FIXED-LABEL: extract_last_i16:
-; SVE-FIXED:       // %bb.0:
-; SVE-FIXED-NEXT:    sub sp, sp, #16
-; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT:    // kill: def $d1 killed $d1 def $q1
-; SVE-FIXED-NEXT:    umov w8, v1.b[0]
-; SVE-FIXED-NEXT:    umov w10, v1.b[6]
-; SVE-FIXED-NEXT:    index z6.d, #0, #1
-; SVE-FIXED-NEXT:    umov w11, v1.b[2]
-; SVE-FIXED-NEXT:    umov w14, v1.b[4]
-; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    umov w9, v1.b[1]
-; SVE-FIXED-NEXT:    umov w12, v1.b[7]
-; SVE-FIXED-NEXT:    umov w13, v1.b[3]
-; SVE-FIXED-NEXT:    fmov s2, w8
-; SVE-FIXED-NEXT:    umov w8, v1.b[5]
-; SVE-FIXED-NEXT:    fmov s3, w10
-; SVE-FIXED-NEXT:    fmov s4, w11
-; SVE-FIXED-NEXT:    fmov s5, w14
-; SVE-FIXED-NEXT:    mov z7.d, z6.d
-; SVE-FIXED-NEXT:    mov z16.d, z6.d
-; SVE-FIXED-NEXT:    mov z17.d, z6.d
-; SVE-FIXED-NEXT:    shl v1.8b, v1.8b, #7
-; SVE-FIXED-NEXT:    mov v2.s[1], w9
-; SVE-FIXED-NEXT:    mov x9, sp
-; SVE-FIXED-NEXT:    mov v3.s[1], w12
-; SVE-FIXED-NEXT:    mov v4.s[1], w13
-; SVE-FIXED-NEXT:    mov v5.s[1], w8
-; SVE-FIXED-NEXT:    add z7.d, z7.d, #2 // =0x2
-; SVE-FIXED-NEXT:    add z17.d, z17.d, #6 // =0x6
-; SVE-FIXED-NEXT:    add z16.d, z16.d, #4 // =0x4
-; SVE-FIXED-NEXT:    cmlt v1.8b, v1.8b, #0
-; SVE-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
-; SVE-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
-; SVE-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
-; SVE-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
-; SVE-FIXED-NEXT:    umaxv b1, v1.8b
-; SVE-FIXED-NEXT:    shl v2.2d, v2.2d, #63
-; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT:    shl v4.2d, v4.2d, #63
-; SVE-FIXED-NEXT:    shl v5.2d, v5.2d, #63
-; SVE-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
-; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
-; SVE-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
-; SVE-FIXED-NEXT:    and v2.16b, v2.16b, v6.16b
-; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v17.16b
-; SVE-FIXED-NEXT:    and v4.16b, v4.16b, v7.16b
-; SVE-FIXED-NEXT:    and v5.16b, v5.16b, v16.16b
-; SVE-FIXED-NEXT:    cmhi v6.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT:    cmhi v7.2d, v2.2d, v5.2d
-; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v6.16b
-; SVE-FIXED-NEXT:    bif v2.16b, v5.16b, v7.16b
-; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT:    cmhi d4, d2, d3
-; SVE-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT:    fmov x8, d2
-; SVE-FIXED-NEXT:    bfi x9, x8, #1, #3
-; SVE-FIXED-NEXT:    ldrh w8, [x9]
-; SVE-FIXED-NEXT:    fmov w9, s1
-; SVE-FIXED-NEXT:    tst w9, #0x1
-; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
-; SVE-FIXED-NEXT:    add sp, sp, #16
-; SVE-FIXED-NEXT:    ret
-  %res = call i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru)
-  ret i16 %res
-}
-
-define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i32:
-; NEON-FIXED:       // %bb.0:
-; NEON-FIXED-NEXT:    sub sp, sp, #16
-; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT:    ushll v2.4s, v1.4h, #0
-; NEON-FIXED-NEXT:    adrp x8, .LCPI2_0
-; NEON-FIXED-NEXT:    adrp x9, .LCPI2_1
-; NEON-FIXED-NEXT:    ldr q4, [x8, :lo12:.LCPI2_0]
-; NEON-FIXED-NEXT:    ldr q5, [x9, :lo12:.LCPI2_1]
-; NEON-FIXED-NEXT:    shl v1.4h, v1.4h, #15
-; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    ushll2 v3.2d, v2.4s, #0
-; NEON-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
-; NEON-FIXED-NEXT:    cmlt v1.4h, v1.4h, #0
-; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT:    shl v2.2d, v2.2d, #63
-; NEON-FIXED-NEXT:    umaxv h1, v1.4h
-; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
-; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT:    and v2.16b, v2.16b, v5.16b
-; NEON-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
-; NEON-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT:    bic v3.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT:    cmhi d4, d3, d2
-; NEON-FIXED-NEXT:    bit v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT:    fmov x8, d2
-; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT:    ldr w8, [x9]
-; NEON-FIXED-NEXT:    fmov w9, s1
-; NEON-FIXED-NEXT:    tst w9, #0x1
-; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
-; NEON-FIXED-NEXT:    add sp, sp, #16
-; NEON-FIXED-NEXT:    ret
-;
-; SVE-FIXED-LABEL: extract_last_i32:
-; SVE-FIXED:       // %bb.0:
-; SVE-FIXED-NEXT:    sub sp, sp, #16
-; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT:    ushll v2.4s, v1.4h, #0
-; SVE-FIXED-NEXT:    index z4.d, #0, #1
-; SVE-FIXED-NEXT:    shl v1.4h, v1.4h, #15
-; SVE-FIXED-NEXT:    mov x9, sp
-; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    ushll2 v3.2d, v2.4s, #0
-; SVE-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
-; SVE-FIXED-NEXT:    cmlt v1.4h, v1.4h, #0
-; SVE-FIXED-NEXT:    mov z5.d, z4.d
-; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT:    shl v2.2d, v2.2d, #63
-; SVE-FIXED-NEXT:    umaxv h1, v1.4h
-; SVE-FIXED-NEXT:    add z5.d, z5.d, #2 // =0x2
-; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
-; SVE-FIXED-NEXT:    and v2.16b, v2.16b, v4.16b
-; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v5.16b
-; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT:    bic v3.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT:    cmhi d4, d3, d2
-; SVE-FIXED-NEXT:    bit v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT:    fmov x8, d2
-; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT:    ldr w8, [x9]
-; SVE-FIXED-NEXT:    fmov w9, s1
-; SVE-FIXED-NEXT:    tst w9, #0x1
-; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
-; SVE-FIXED-NEXT:    add sp, sp, #16
-; SVE-FIXED-NEXT:    ret
-  %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
-  ret i32 %res
-}
-
-define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) {
-; CHECK-LABEL: extract_last_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmov d2, xzr
-; CHECK-NEXT:    fmov d4, x8
-; CHECK-NEXT:    shl v1.2s, v1.2s, #31
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    shl v3.2d, v3.2d, #63
-; CHECK-NEXT:    cmlt v1.2s, v1.2s, #0
-; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
-; CHECK-NEXT:    umaxp v1.2s, v1.2s, v1.2s
-; CHECK-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    and v3.8b, v3.8b, v4.8b
-; CHECK-NEXT:    cmhi d2, d2, d3
-; CHECK-NEXT:    bic v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    fmov x8, d2
-; CHECK-NEXT:    orr x8, x9, x8, lsl #3
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    ldr x8, [x8]
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    csel x0, x8, x0, ne
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-  %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru)
-  ret i64 %res
-}
-
-define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
-; CHECK-LABEL: extract_last_i8_scalable:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    mov z3.d, #0 // =0x0
-; CHECK-NEXT:    punpkhi p4.h, p0.b
-; CHECK-NEXT:    punpklo p5.h, p2.b
-; CHECK-NEXT:    punpkhi p1.h, p4.b
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    punpkhi p3.h, p2.b
-; CHECK-NEXT:    punpklo p2.h, p4.b
-; CHECK-NEXT:    incd z2.d
-; CHECK-NEXT:    incd z5.d, all, mul #2
-; CHECK-NEXT:    punpklo p4.h, p5.b
-; CHECK-NEXT:    incd z6.d, all, mul #4
-; CHECK-NEXT:    punpkhi p6.h, p1.b
-; CHECK-NEXT:    punpkhi p7.h, p3.b
-; CHECK-NEXT:    sel z1.d, p4, z1.d, z3.d
-; CHECK-NEXT:    mov z4.d, z2.d
-; CHECK-NEXT:    mov z7.d, z2.d
-; CHECK-NEXT:    mov z25.d, z5.d
-; CHECK-NEXT:    punpkhi p5.h, p5.b
-; CHECK-NEXT:    punpkhi p4.h, p2.b
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    incd z25.d, all, mul #4
-; CHECK-NEXT:    incd z7.d, all, mul #4
-; CHECK-NEXT:    punpklo p3.h, p3.b
-; CHECK-NEXT:    sel z2.d, p5, z2.d, z3.d
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    punpklo p2.h, p2.b
-; CHECK-NEXT:    mov z24.d, z4.d
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    sel z5.d, p3, z5.d, z3.d
-; CHECK-NEXT:    sel z4.d, p7, z4.d, z3.d
-; CHECK-NEXT:    sel z6.d, p2, z6.d, z3.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z25.d, p1, z25.d, z3.d
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    incd z24.d, all, mul #4
-; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z6.d
-; CHECK-NEXT:    sel z24.d, p6, z24.d, z3.d
-; CHECK-NEXT:    mov z3.d, p4/m, z7.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    umax z4.d, p1/m, z4.d, z24.d
-; CHECK-NEXT:    umax z2.d, p1/m, z2.d, z3.d
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    umax z3.d, p1/m, z3.d, z25.d
-; CHECK-NEXT:    umax z2.d, p1/m, z2.d, z4.d
-; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z3.d
-; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT:    umaxv d1, p1, z1.d
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    whilels p1.b, xzr, x8
-; CHECK-NEXT:    ptest p0, p0.b
-; CHECK-NEXT:    lastb w8, p1, z0.b
-; CHECK-NEXT:    csel w0, w8, w0, ne
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
-  ret i8 %res
-}
-
-define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
-; CHECK-LABEL: extract_last_i16_scalable:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    mov z5.d, #0 // =0x0
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    punpkhi p3.h, p1.b
-; CHECK-NEXT:    punpkhi p4.h, p2.b
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpklo p2.h, p2.b
-; CHECK-NEXT:    incd z2.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    sel z1.d, p2, z1.d, z5.d
-; CHECK-NEXT:    mov z4.d, z2.d
-; CHECK-NEXT:    sel z2.d, p4, z2.d, z5.d
-; CHECK-NEXT:    sel z3.d, p1, z3.d, z5.d
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z3.d
-; CHECK-NEXT:    sel z4.d, p3, z4.d, z5.d
-; CHECK-NEXT:    umax z2.d, p1/m, z2.d, z4.d
-; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT:    umaxv d1, p1, z1.d
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    whilels p1.h, xzr, x8
-; CHECK-NEXT:    lastb w8, p1, z0.h
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    ptest p1, p0.b
-; CHECK-NEXT:    csel w0, w8, w0, ne
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
-  ret i16 %res
-}
-
-define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
-; CHECK-LABEL: extract_last_i32_scalable:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    mov z3.d, #0 // =0x0
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    sel z1.d, p2, z1.d, z3.d
-; CHECK-NEXT:    incd z2.d
-; CHECK-NEXT:    sel z2.d, p1, z2.d, z3.d
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT:    umaxv d1, p1, z1.d
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    whilels p1.s, xzr, x8
-; CHECK-NEXT:    lastb w8, p1, z0.s
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    ptest p1, p0.b
-; CHECK-NEXT:    csel w0, w8, w0, ne
-; CHECK-NEXT:    ret
-  %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
-  ret i32 %res
-}
-
-define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
-; CHECK-LABEL: extract_last_i64_scalable:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    sel z1.d, p0, z1.d, z2.d
-; CHECK-NEXT:    umaxv d1, p1, z1.d
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    whilels p2.d, xzr, x8
-; CHECK-NEXT:    ptest p1, p0.b
-; CHECK-NEXT:    lastb x8, p2, z0.d
-; CHECK-NEXT:    csel x0, x8, x0, ne
-; CHECK-NEXT:    ret
-  %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
-  ret i64 %res
-}
-
-declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
-declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
-declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
-declare i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
-declare i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
-declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
-declare i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
-declare i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
-
-attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }

>From 49b9a7a916db0ebc8b58f2cd5442443996f80fc4 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 14 Nov 2024 13:44:45 +0000
Subject: [PATCH 3/3] Move lowering code to dedicated function

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 81 ++++++++++---------
 .../SelectionDAG/SelectionDAGBuilder.h        |  1 +
 2 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 06755926841ac3..55cbd093ffd68e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6416,6 +6416,50 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
   DAG.setRoot(Histogram);
 }
 
+void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
+                                                       unsigned Intrinsic) {
+  assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
+         "Tried lowering invalid vector extract last");
+  SDLoc sdl = getCurSDLoc();
+  SDValue Data = getValue(I.getOperand(0));
+  SDValue Mask = getValue(I.getOperand(1));
+  SDValue PassThru = getValue(I.getOperand(2));
+
+  EVT DataVT = Data.getValueType();
+  EVT ScalarVT = PassThru.getValueType();
+  EVT BoolVT = Mask.getValueType().getScalarType();
+
+  // Find a suitable type for a stepvector.
+  ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+  if (DataVT.isScalableVector())
+    VScaleRange = getVScaleRange(I.getCaller(), 64);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned EltWidth = TLI.getBitWidthForCttzElements(
+      I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
+      &VScaleRange);
+  MVT StepVT = MVT::getIntegerVT(EltWidth);
+  EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+  // Zero out lanes with inactive elements, then find the highest remaining
+  // value from the stepvector.
+  SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
+  SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
+  SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
+  SDValue HighestIdx =
+      DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
+
+  // Extract the corresponding lane from the data vector
+  EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+  SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
+  SDValue Extract =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
+
+  // If all mask lanes were inactive, choose the passthru value instead.
+  SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+  SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+  setValue(&I, Result);
+}
+
 /// Lower the call to the specified intrinsic function.
 void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -8208,42 +8252,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   }
   case Intrinsic::experimental_vector_extract_last_active: {
-    SDValue Data = getValue(I.getOperand(0));
-    SDValue Mask = getValue(I.getOperand(1));
-    SDValue PassThru = getValue(I.getOperand(2));
-
-    EVT DataVT = Data.getValueType();
-    EVT ScalarVT = PassThru.getValueType();
-    EVT BoolVT = Mask.getValueType().getScalarType();
-
-    // Find a suitable type for a stepvector.
-    ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
-    if (DataVT.isScalableVector())
-      VScaleRange = getVScaleRange(I.getCaller(), 64);
-    unsigned EltWidth = TLI.getBitWidthForCttzElements(
-        I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
-        &VScaleRange);
-    MVT StepVT = MVT::getIntegerVT(EltWidth);
-    EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
-
-    // Zero out lanes with inactive elements, then find the highest remaining
-    // value from the stepvector.
-    SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
-    SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
-    SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
-    SDValue HighestIdx =
-        DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
-
-    // Extract the corresponding lane from the data vector
-    EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-    SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
-    SDValue Extract =
-        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
-
-    // If all mask lanes were inactive, choose the passthru value instead.
-    SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
-    SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
-    setValue(&I, Result);
+    visitVectorExtractLastActive(I, Intrinsic);
     return;
   }
   }
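
The moved code is functionally identical to the inlined version it replaces.
As an IR-level sketch of the expansion it performs (assuming a fixed
<4 x i32> case and the current llvm.stepvector name; older releases spell it
llvm.experimental.stepvector, and the real code emits the equivalent
SelectionDAG nodes, picking the step element width via
getBitWidthForCttzElements):

    declare <4 x i32> @llvm.stepvector.v4i32()
    declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
    declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

    define i32 @expanded(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
      ; Step vector <0,1,2,3>; zeroing the inactive lanes means the unsigned
      ; max of the result is the index of the last active lane.
      %step   = call <4 x i32> @llvm.stepvector.v4i32()
      %active = select <4 x i1> %mask, <4 x i32> %step, <4 x i32> zeroinitializer
      %last   = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %active)
      %elt    = extractelement <4 x i32> %data, i32 %last
      ; An all-false mask still computes index 0 above; this final select is
      ; what makes the passthru win in that case.
      %any    = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
      %res    = select i1 %any, i32 %elt, i32 %passthru
      ret i32 %res
    }

In the AArch64 tests this maps to 'umaxv' for the VECREDUCE_UMAX, 'whilels'
plus 'lastb' for the indexed extract, and 'ptest' feeding 'csel'/'fcsel' for
the final any-active select.
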
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 3f8a3e7ffb65bb..3a8dc25e98700e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -629,6 +629,7 @@ class SelectionDAGBuilder {
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
   void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
   void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
+  void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic);
   void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
                    const SmallVectorImpl<SDValue> &OpValues);
   void visitVPStore(const VPIntrinsic &VPIntrin,


