[llvm] Vector masked extract last active element intrinsic (PR #113587)

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 24 08:42:47 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-aarch64

Author: Graham Hunter (huntergr-arm)

<details>
<summary>Changes</summary>

As discussed in #112738, it may be better to have an intrinsic to represent vector element extracts based on mask bits. This intrinsic is for the case of extracting the last active element, if any, or a default value if the mask is all-false.

The target-agnostic SelectionDAG lowering is similar to the IR in #106560.

If this intrinsic is acceptable then I can rework the clastb codegen to use it instead.

---

Patch is 32.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113587.diff


4 Files Affected:

- (modified) llvm/docs/LangRef.rst (+30) 
- (modified) llvm/include/llvm/IR/Intrinsics.td (+6) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+23) 
- (added) llvm/test/CodeGen/AArch64/vector-masked-extract.ll (+663) 


``````````diff
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b83675c6ed97aa..75d70043598218 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19956,6 +19956,36 @@ the follow sequence of operations:
 
 The ``mask`` operand will apply to at least the gather and scatter operations.
 
+'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is an overloaded intrinsic.
+
+This intrinsic will extract the value from a single lane of a vector, based
+on a supplied mask vector.
+
+::
+
+    declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+    declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+
+Arguments:
+""""""""""
+
+The first argument is the data vector to extract a lane from. The second is a
+mask vector controlling the extraction. The third argument is a passthru
+value.
+
+The two input vectors must have the same number of elements, and the type of
+the passthru value must match that of the elements of the data vector.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will
+find the index of the most significant active lane in the mask vector, and
+extract the element at that index in the corresponding data vector. If no mask
+lanes are active then the passthru value is returned instead.
 
 .. _int_vector_compress:
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 94e53f372127da..557140cf1a62bf 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1918,6 +1918,12 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
                              [ IntrArgMemOnly ]>;
 
+// Extract based on mask bits
+def int_experimental_vector_masked_extract_last_active:
+    DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+              [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+               LLVMVectorElementType<0>], [IntrNoMem]>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 8450553743074c..f80342ff8e2f40 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8187,6 +8187,29 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     visitVectorHistogram(I, Intrinsic);
     return;
   }
+  case Intrinsic::experimental_vector_masked_extract_last_active: {
+    SDValue Data = getValue(I.getOperand(0));
+    SDValue Mask = getValue(I.getOperand(1));
+    SDValue PassThru = getValue(I.getOperand(2));
+
+    EVT DataVT = Data.getValueType();
+    EVT ScalarVT = PassThru.getValueType();
+    EVT BoolVT = Mask.getValueType().getScalarType();
+    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+    EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT);
+
+    SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT);
+    SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT);
+    SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes);
+    SDValue HighestIdx =
+        DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts);
+    SDValue Extract =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx);
+    SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+    SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+    setValue(&I, Result);
+    return;
+  }
   }
 }
 
diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
new file mode 100644
index 00000000000000..04adf4e476b041
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
+; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i8:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    umov w15, v1.b[14]
+; NEON-FIXED-NEXT:    umov w14, v1.b[6]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_0
+; NEON-FIXED-NEXT:    umov w12, v1.b[15]
+; NEON-FIXED-NEXT:    umov w13, v1.b[10]
+; NEON-FIXED-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT:    umov w11, v1.b[2]
+; NEON-FIXED-NEXT:    umov w8, v1.b[7]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    umov w9, v1.b[11]
+; NEON-FIXED-NEXT:    umov w10, v1.b[3]
+; NEON-FIXED-NEXT:    umov w16, v1.b[12]
+; NEON-FIXED-NEXT:    fmov s3, w15
+; NEON-FIXED-NEXT:    umov w15, v1.b[4]
+; NEON-FIXED-NEXT:    fmov s4, w14
+; NEON-FIXED-NEXT:    fmov s5, w13
+; NEON-FIXED-NEXT:    umov w13, v1.b[0]
+; NEON-FIXED-NEXT:    umov w14, v1.b[13]
+; NEON-FIXED-NEXT:    fmov s6, w11
+; NEON-FIXED-NEXT:    umov w11, v1.b[5]
+; NEON-FIXED-NEXT:    mov v3.s[1], w12
+; NEON-FIXED-NEXT:    umov w12, v1.b[8]
+; NEON-FIXED-NEXT:    mov v4.s[1], w8
+; NEON-FIXED-NEXT:    umov w8, v1.b[9]
+; NEON-FIXED-NEXT:    mov v5.s[1], w9
+; NEON-FIXED-NEXT:    umov w9, v1.b[1]
+; NEON-FIXED-NEXT:    fmov s7, w16
+; NEON-FIXED-NEXT:    fmov s16, w15
+; NEON-FIXED-NEXT:    mov v6.s[1], w10
+; NEON-FIXED-NEXT:    fmov s18, w13
+; NEON-FIXED-NEXT:    shl v1.16b, v1.16b, #7
+; NEON-FIXED-NEXT:    fmov s17, w12
+; NEON-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT:    mov v7.s[1], w14
+; NEON-FIXED-NEXT:    mov v16.s[1], w11
+; NEON-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT:    mov v18.s[1], w9
+; NEON-FIXED-NEXT:    adrp x9, .LCPI0_2
+; NEON-FIXED-NEXT:    ushll v6.2d, v6.2s, #0
+; NEON-FIXED-NEXT:    ldr q20, [x9, :lo12:.LCPI0_2]
+; NEON-FIXED-NEXT:    adrp x9, .LCPI0_7
+; NEON-FIXED-NEXT:    mov v17.s[1], w8
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_1
+; NEON-FIXED-NEXT:    ldr q23, [x9, :lo12:.LCPI0_7]
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ldr q19, [x8, :lo12:.LCPI0_1]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_3
+; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT:    ushll v7.2d, v7.2s, #0
+; NEON-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT:    ushll v16.2d, v16.2s, #0
+; NEON-FIXED-NEXT:    ushll v17.2d, v17.2s, #0
+; NEON-FIXED-NEXT:    shl v6.2d, v6.2d, #63
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    ushll v18.2d, v18.2s, #0
+; NEON-FIXED-NEXT:    cmlt v1.16b, v1.16b, #0
+; NEON-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT:    cmlt v6.2d, v6.2d, #0
+; NEON-FIXED-NEXT:    and v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT:    shl v3.2d, v7.2d, #63
+; NEON-FIXED-NEXT:    shl v7.2d, v16.2d, #63
+; NEON-FIXED-NEXT:    shl v16.2d, v17.2d, #63
+; NEON-FIXED-NEXT:    ldr q17, [x8, :lo12:.LCPI0_3]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_4
+; NEON-FIXED-NEXT:    ldr q21, [x8, :lo12:.LCPI0_4]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_5
+; NEON-FIXED-NEXT:    shl v18.2d, v18.2d, #63
+; NEON-FIXED-NEXT:    ldr q22, [x8, :lo12:.LCPI0_5]
+; NEON-FIXED-NEXT:    adrp x8, .LCPI0_6
+; NEON-FIXED-NEXT:    and v4.16b, v4.16b, v19.16b
+; NEON-FIXED-NEXT:    ldr q19, [x8, :lo12:.LCPI0_6]
+; NEON-FIXED-NEXT:    cmlt v16.2d, v16.2d, #0
+; NEON-FIXED-NEXT:    and v5.16b, v5.16b, v20.16b
+; NEON-FIXED-NEXT:    cmlt v18.2d, v18.2d, #0
+; NEON-FIXED-NEXT:    and v6.16b, v6.16b, v17.16b
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    cmlt v7.2d, v7.2d, #0
+; NEON-FIXED-NEXT:    umaxv b1, v1.16b
+; NEON-FIXED-NEXT:    and v16.16b, v16.16b, v19.16b
+; NEON-FIXED-NEXT:    and v17.16b, v18.16b, v23.16b
+; NEON-FIXED-NEXT:    cmhi v18.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT:    cmhi v19.2d, v6.2d, v5.2d
+; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v21.16b
+; NEON-FIXED-NEXT:    and v7.16b, v7.16b, v22.16b
+; NEON-FIXED-NEXT:    cmhi v21.2d, v17.2d, v16.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v4.16b, v18.16b
+; NEON-FIXED-NEXT:    mov v4.16b, v19.16b
+; NEON-FIXED-NEXT:    cmhi v20.2d, v7.2d, v3.2d
+; NEON-FIXED-NEXT:    bsl v4.16b, v6.16b, v5.16b
+; NEON-FIXED-NEXT:    mov v5.16b, v21.16b
+; NEON-FIXED-NEXT:    bit v3.16b, v7.16b, v20.16b
+; NEON-FIXED-NEXT:    bsl v5.16b, v17.16b, v16.16b
+; NEON-FIXED-NEXT:    cmhi v6.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT:    cmhi v7.2d, v5.2d, v3.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v4.16b, v6.16b
+; NEON-FIXED-NEXT:    bit v3.16b, v5.16b, v7.16b
+; NEON-FIXED-NEXT:    cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT:    cmhi d4, d2, d3
+; NEON-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT:    fmov x8, d2
+; NEON-FIXED-NEXT:    bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT:    ldrb w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i8:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    umov w8, v1.b[14]
+; SVE-FIXED-NEXT:    umov w9, v1.b[6]
+; SVE-FIXED-NEXT:    index z2.d, #0, #1
+; SVE-FIXED-NEXT:    umov w12, v1.b[2]
+; SVE-FIXED-NEXT:    umov w10, v1.b[10]
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    umov w13, v1.b[12]
+; SVE-FIXED-NEXT:    umov w11, v1.b[15]
+; SVE-FIXED-NEXT:    umov w14, v1.b[4]
+; SVE-FIXED-NEXT:    umov w16, v1.b[0]
+; SVE-FIXED-NEXT:    umov w15, v1.b[8]
+; SVE-FIXED-NEXT:    fmov s3, w8
+; SVE-FIXED-NEXT:    umov w8, v1.b[7]
+; SVE-FIXED-NEXT:    fmov s4, w9
+; SVE-FIXED-NEXT:    umov w9, v1.b[11]
+; SVE-FIXED-NEXT:    fmov s6, w12
+; SVE-FIXED-NEXT:    umov w12, v1.b[3]
+; SVE-FIXED-NEXT:    fmov s5, w10
+; SVE-FIXED-NEXT:    umov w10, v1.b[1]
+; SVE-FIXED-NEXT:    fmov s7, w13
+; SVE-FIXED-NEXT:    umov w13, v1.b[13]
+; SVE-FIXED-NEXT:    fmov s16, w14
+; SVE-FIXED-NEXT:    fmov s18, w16
+; SVE-FIXED-NEXT:    mov v4.s[1], w8
+; SVE-FIXED-NEXT:    umov w8, v1.b[5]
+; SVE-FIXED-NEXT:    mov v3.s[1], w11
+; SVE-FIXED-NEXT:    mov v5.s[1], w9
+; SVE-FIXED-NEXT:    mov v6.s[1], w12
+; SVE-FIXED-NEXT:    umov w9, v1.b[9]
+; SVE-FIXED-NEXT:    fmov s17, w15
+; SVE-FIXED-NEXT:    mov v18.s[1], w10
+; SVE-FIXED-NEXT:    mov z19.d, z2.d
+; SVE-FIXED-NEXT:    mov v7.s[1], w13
+; SVE-FIXED-NEXT:    mov z20.d, z2.d
+; SVE-FIXED-NEXT:    mov z21.d, z2.d
+; SVE-FIXED-NEXT:    mov v16.s[1], w8
+; SVE-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT:    ushll v6.2d, v6.2s, #0
+; SVE-FIXED-NEXT:    mov v17.s[1], w9
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    ushll v18.2d, v18.2s, #0
+; SVE-FIXED-NEXT:    mov z25.d, z2.d
+; SVE-FIXED-NEXT:    ushll v7.2d, v7.2s, #0
+; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT:    ushll v16.2d, v16.2s, #0
+; SVE-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT:    shl v6.2d, v6.2d, #63
+; SVE-FIXED-NEXT:    mov z22.d, z2.d
+; SVE-FIXED-NEXT:    mov z23.d, z2.d
+; SVE-FIXED-NEXT:    add z19.d, z19.d, #6 // =0x6
+; SVE-FIXED-NEXT:    shl v18.2d, v18.2d, #63
+; SVE-FIXED-NEXT:    ushll v17.2d, v17.2s, #0
+; SVE-FIXED-NEXT:    shl v7.2d, v7.2d, #63
+; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT:    add z25.d, z25.d, #14 // =0xe
+; SVE-FIXED-NEXT:    shl v16.2d, v16.2d, #63
+; SVE-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT:    add z20.d, z20.d, #10 // =0xa
+; SVE-FIXED-NEXT:    cmlt v6.2d, v6.2d, #0
+; SVE-FIXED-NEXT:    add z21.d, z21.d, #2 // =0x2
+; SVE-FIXED-NEXT:    mov z24.d, z2.d
+; SVE-FIXED-NEXT:    shl v17.2d, v17.2d, #63
+; SVE-FIXED-NEXT:    cmlt v18.2d, v18.2d, #0
+; SVE-FIXED-NEXT:    cmlt v7.2d, v7.2d, #0
+; SVE-FIXED-NEXT:    add z22.d, z22.d, #12 // =0xc
+; SVE-FIXED-NEXT:    cmlt v16.2d, v16.2d, #0
+; SVE-FIXED-NEXT:    add z23.d, z23.d, #4 // =0x4
+; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v25.16b
+; SVE-FIXED-NEXT:    and v4.16b, v4.16b, v19.16b
+; SVE-FIXED-NEXT:    and v5.16b, v5.16b, v20.16b
+; SVE-FIXED-NEXT:    and v6.16b, v6.16b, v21.16b
+; SVE-FIXED-NEXT:    cmlt v17.2d, v17.2d, #0
+; SVE-FIXED-NEXT:    add z24.d, z24.d, #8 // =0x8
+; SVE-FIXED-NEXT:    and v2.16b, v18.16b, v2.16b
+; SVE-FIXED-NEXT:    and v7.16b, v7.16b, v22.16b
+; SVE-FIXED-NEXT:    and v16.16b, v16.16b, v23.16b
+; SVE-FIXED-NEXT:    cmhi v18.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT:    shl v1.16b, v1.16b, #7
+; SVE-FIXED-NEXT:    cmhi v19.2d, v6.2d, v5.2d
+; SVE-FIXED-NEXT:    and v17.16b, v17.16b, v24.16b
+; SVE-FIXED-NEXT:    cmhi v20.2d, v16.2d, v7.2d
+; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v18.16b
+; SVE-FIXED-NEXT:    cmlt v1.16b, v1.16b, #0
+; SVE-FIXED-NEXT:    mov v4.16b, v19.16b
+; SVE-FIXED-NEXT:    cmhi v21.2d, v2.2d, v17.2d
+; SVE-FIXED-NEXT:    umaxv b1, v1.16b
+; SVE-FIXED-NEXT:    bsl v4.16b, v6.16b, v5.16b
+; SVE-FIXED-NEXT:    mov v5.16b, v20.16b
+; SVE-FIXED-NEXT:    bif v2.16b, v17.16b, v21.16b
+; SVE-FIXED-NEXT:    bsl v5.16b, v16.16b, v7.16b
+; SVE-FIXED-NEXT:    cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT:    cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT:    bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT:    cmhi d4, d2, d3
+; SVE-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT:    fmov x8, d2
+; SVE-FIXED-NEXT:    bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT:    ldrb w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add sp, sp, #16
+; SVE-FIXED-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i16:
+; NEON-FIXED:       // %bb.0:
+; NEON-FIXED-NEXT:    sub sp, sp, #16
+; NEON-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NEON-FIXED-NEXT:    umov w8, v1.b[6]
+; NEON-FIXED-NEXT:    umov w9, v1.b[2]
+; NEON-FIXED-NEXT:    str q0, [sp]
+; NEON-FIXED-NEXT:    umov w11, v1.b[4]
+; NEON-FIXED-NEXT:    umov w12, v1.b[0]
+; NEON-FIXED-NEXT:    umov w10, v1.b[7]
+; NEON-FIXED-NEXT:    umov w13, v1.b[3]
+; NEON-FIXED-NEXT:    umov w14, v1.b[5]
+; NEON-FIXED-NEXT:    umov w15, v1.b[1]
+; NEON-FIXED-NEXT:    shl v1.8b, v1.8b, #7
+; NEON-FIXED-NEXT:    fmov s2, w8
+; NEON-FIXED-NEXT:    adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT:    fmov s3, w9
+; NEON-FIXED-NEXT:    fmov s4, w11
+; NEON-FIXED-NEXT:    adrp x9, .LCPI1_1
+; NEON-FIXED-NEXT:    ldr q6, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT:    fmov s5, w12
+; NEON-FIXED-NEXT:    adrp x8, .LCPI1_3
+; NEON-FIXED-NEXT:    ldr q7, [x9, :lo12:.LCPI1_1]
+; NEON-FIXED-NEXT:    mov v2.s[1], w10
+; NEON-FIXED-NEXT:    mov v3.s[1], w13
+; NEON-FIXED-NEXT:    adrp x10, .LCPI1_2
+; NEON-FIXED-NEXT:    mov v4.s[1], w14
+; NEON-FIXED-NEXT:    ldr q16, [x10, :lo12:.LCPI1_2]
+; NEON-FIXED-NEXT:    ldr q17, [x8, :lo12:.LCPI1_3]
+; NEON-FIXED-NEXT:    mov v5.s[1], w15
+; NEON-FIXED-NEXT:    cmlt v1.8b, v1.8b, #0
+; NEON-FIXED-NEXT:    mov x9, sp
+; NEON-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT:    umaxv b1, v1.8b
+; NEON-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT:    shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT:    and v2.16b, v2.16b, v6.16b
+; NEON-FIXED-NEXT:    and v3.16b, v3.16b, v7.16b
+; NEON-FIXED-NEXT:    and v4.16b, v4.16b, v16.16b
+; NEON-FIXED-NEXT:    and v5.16b, v5.16b, v17.16b
+; NEON-FIXED-NEXT:    cmhi v6.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT:    cmhi v7.2d, v5.2d, v4.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v6.16b
+; NEON-FIXED-NEXT:    mov v3.16b, v7.16b
+; NEON-FIXED-NEXT:    bsl v3.16b, v5.16b, v4.16b
+; NEON-FIXED-NEXT:    cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT:    bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT:    cmhi d4, d2, d3
+; NEON-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT:    fmov x8, d2
+; NEON-FIXED-NEXT:    bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT:    ldrh w8, [x9]
+; NEON-FIXED-NEXT:    fmov w9, s1
+; NEON-FIXED-NEXT:    tst w9, #0x1
+; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
+; NEON-FIXED-NEXT:    add sp, sp, #16
+; NEON-FIXED-NEXT:    ret
+;
+; SVE-FIXED-LABEL: extract_last_i16:
+; SVE-FIXED:       // %bb.0:
+; SVE-FIXED-NEXT:    sub sp, sp, #16
+; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SVE-FIXED-NEXT:    umov w8, v1.b[0]
+; SVE-FIXED-NEXT:    umov w10, v1.b[6]
+; SVE-FIXED-NEXT:    index z6.d, #0, #1
+; SVE-FIXED-NEXT:    umov w11, v1.b[2]
+; SVE-FIXED-NEXT:    umov w14, v1.b[4]
+; SVE-FIXED-NEXT:    str q0, [sp]
+; SVE-FIXED-NEXT:    umov w9, v1.b[1]
+; SVE-FIXED-NEXT:    umov w12, v1.b[7]
+; SVE-FIXED-NEXT:    umov w13, v1.b[3]
+; SVE-FIXED-NEXT:    fmov s2, w8
+; SVE-FIXED-NEXT:    umov w8, v1.b[5]
+; SVE-FIXED-NEXT:    fmov s3, w10
+; SVE-FIXED-NEXT:    fmov s4, w11
+; SVE-FIXED-NEXT:    fmov s5, w14
+; SVE-FIXED-NEXT:    mov z7.d, z6.d
+; SVE-FIXED-NEXT:    mov z16.d, z6.d
+; SVE-FIXED-NEXT:    mov z17.d, z6.d
+; SVE-FIXED-NEXT:    shl v1.8b, v1.8b, #7
+; SVE-FIXED-NEXT:    mov v2.s[1], w9
+; SVE-FIXED-NEXT:    mov x9, sp
+; SVE-FIXED-NEXT:    mov v3.s[1], w12
+; SVE-FIXED-NEXT:    mov v4.s[1], w13
+; SVE-FIXED-NEXT:    mov v5.s[1], w8
+; SVE-FIXED-NEXT:    add z7.d, z7.d, #2 // =0x2
+; SVE-FIXED-NEXT:    add z17.d, z17.d, #6 // =0x6
+; SVE-FIXED-NEXT:    add z16.d, z16.d, #4 // =0x4
+; SVE-FIXED-NEXT:    cmlt v1.8b, v1.8b, #0
+; SVE-FIXED-NEXT:    ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT:    ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT:    ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT:    ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT:    umaxv b1, v1.8b
+; SVE-FIXED-NEXT:    shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT:    shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT:    shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT:    shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT:    cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT:    cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT:    cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT:    cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT:    and v2.16b, v2.16b, v6.16b
+; SVE-FIXED-NEXT:    and v3.16b, v3.16b, v17.16b
+; SVE-FIXED-NEXT:    and v4.16b, v4.16b, v7.16b
+; SVE-FIXED-NEXT:    and v5.16b, v5.16b, v16.16b
+; SVE-FIXED-NEXT:    cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT:    cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT:    bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT:    bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT:    cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT:    bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT:    cmhi d4, d2, d3
+; SVE-FIXED-NEXT:    bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT:    fmov x8, d2
+; SVE-FIXED-NEXT:    bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT:    ldrh w8, [x9]
+; SVE-FIXED-NEXT:    fmov w9, s1
+; SVE-FIXED-NEXT:    tst w9, #0x1
+; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
+; SVE-FIXED-NEXT:    add...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/113587


More information about the llvm-commits mailing list