[llvm] Vector masked extract last active element intrinsic (PR #113587)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 7 05:42:42 PST 2024
================
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED
+; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i8:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: umov w15, v1.b[14]
+; NEON-FIXED-NEXT: umov w14, v1.b[6]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
+; NEON-FIXED-NEXT: umov w12, v1.b[15]
+; NEON-FIXED-NEXT: umov w13, v1.b[10]
+; NEON-FIXED-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT: umov w11, v1.b[2]
+; NEON-FIXED-NEXT: umov w8, v1.b[7]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: umov w9, v1.b[11]
+; NEON-FIXED-NEXT: umov w10, v1.b[3]
+; NEON-FIXED-NEXT: umov w16, v1.b[12]
+; NEON-FIXED-NEXT: fmov s3, w15
+; NEON-FIXED-NEXT: umov w15, v1.b[4]
+; NEON-FIXED-NEXT: fmov s4, w14
+; NEON-FIXED-NEXT: fmov s5, w13
+; NEON-FIXED-NEXT: umov w13, v1.b[0]
+; NEON-FIXED-NEXT: umov w14, v1.b[13]
+; NEON-FIXED-NEXT: fmov s6, w11
+; NEON-FIXED-NEXT: umov w11, v1.b[5]
+; NEON-FIXED-NEXT: mov v3.s[1], w12
+; NEON-FIXED-NEXT: umov w12, v1.b[8]
+; NEON-FIXED-NEXT: mov v4.s[1], w8
+; NEON-FIXED-NEXT: umov w8, v1.b[9]
+; NEON-FIXED-NEXT: mov v5.s[1], w9
+; NEON-FIXED-NEXT: umov w9, v1.b[1]
+; NEON-FIXED-NEXT: fmov s7, w16
+; NEON-FIXED-NEXT: fmov s16, w15
+; NEON-FIXED-NEXT: mov v6.s[1], w10
+; NEON-FIXED-NEXT: fmov s18, w13
+; NEON-FIXED-NEXT: shl v1.16b, v1.16b, #7
+; NEON-FIXED-NEXT: fmov s17, w12
+; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT: mov v7.s[1], w14
+; NEON-FIXED-NEXT: mov v16.s[1], w11
+; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT: mov v18.s[1], w9
+; NEON-FIXED-NEXT: adrp x9, .LCPI0_2
+; NEON-FIXED-NEXT: ushll v6.2d, v6.2s, #0
+; NEON-FIXED-NEXT: ldr q20, [x9, :lo12:.LCPI0_2]
+; NEON-FIXED-NEXT: adrp x9, .LCPI0_7
+; NEON-FIXED-NEXT: mov v17.s[1], w8
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_1
+; NEON-FIXED-NEXT: ldr q23, [x9, :lo12:.LCPI0_7]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_1]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_3
+; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT: ushll v7.2d, v7.2s, #0
+; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT: ushll v16.2d, v16.2s, #0
+; NEON-FIXED-NEXT: ushll v17.2d, v17.2s, #0
+; NEON-FIXED-NEXT: shl v6.2d, v6.2d, #63
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: ushll v18.2d, v18.2s, #0
+; NEON-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
+; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
+; NEON-FIXED-NEXT: and v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT: shl v3.2d, v7.2d, #63
+; NEON-FIXED-NEXT: shl v7.2d, v16.2d, #63
+; NEON-FIXED-NEXT: shl v16.2d, v17.2d, #63
+; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI0_3]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_4
+; NEON-FIXED-NEXT: ldr q21, [x8, :lo12:.LCPI0_4]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_5
+; NEON-FIXED-NEXT: shl v18.2d, v18.2d, #63
+; NEON-FIXED-NEXT: ldr q22, [x8, :lo12:.LCPI0_5]
+; NEON-FIXED-NEXT: adrp x8, .LCPI0_6
+; NEON-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
+; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_6]
+; NEON-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
+; NEON-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
+; NEON-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
+; NEON-FIXED-NEXT: and v6.16b, v6.16b, v17.16b
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
+; NEON-FIXED-NEXT: umaxv b1, v1.16b
+; NEON-FIXED-NEXT: and v16.16b, v16.16b, v19.16b
+; NEON-FIXED-NEXT: and v17.16b, v18.16b, v23.16b
+; NEON-FIXED-NEXT: cmhi v18.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
+; NEON-FIXED-NEXT: and v3.16b, v3.16b, v21.16b
+; NEON-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
+; NEON-FIXED-NEXT: cmhi v21.2d, v17.2d, v16.2d
+; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v18.16b
+; NEON-FIXED-NEXT: mov v4.16b, v19.16b
+; NEON-FIXED-NEXT: cmhi v20.2d, v7.2d, v3.2d
+; NEON-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
+; NEON-FIXED-NEXT: mov v5.16b, v21.16b
+; NEON-FIXED-NEXT: bit v3.16b, v7.16b, v20.16b
+; NEON-FIXED-NEXT: bsl v5.16b, v17.16b, v16.16b
+; NEON-FIXED-NEXT: cmhi v6.2d, v4.2d, v2.2d
+; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v3.2d
+; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v6.16b
+; NEON-FIXED-NEXT: bit v3.16b, v5.16b, v7.16b
+; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: cmhi d4, d2, d3
+; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT: ldrb w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i8:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: umov w8, v1.b[14]
+; SVE-FIXED-NEXT: umov w9, v1.b[6]
+; SVE-FIXED-NEXT: index z2.d, #0, #1
+; SVE-FIXED-NEXT: umov w12, v1.b[2]
+; SVE-FIXED-NEXT: umov w10, v1.b[10]
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: umov w13, v1.b[12]
+; SVE-FIXED-NEXT: umov w11, v1.b[15]
+; SVE-FIXED-NEXT: umov w14, v1.b[4]
+; SVE-FIXED-NEXT: umov w16, v1.b[0]
+; SVE-FIXED-NEXT: umov w15, v1.b[8]
+; SVE-FIXED-NEXT: fmov s3, w8
+; SVE-FIXED-NEXT: umov w8, v1.b[7]
+; SVE-FIXED-NEXT: fmov s4, w9
+; SVE-FIXED-NEXT: umov w9, v1.b[11]
+; SVE-FIXED-NEXT: fmov s6, w12
+; SVE-FIXED-NEXT: umov w12, v1.b[3]
+; SVE-FIXED-NEXT: fmov s5, w10
+; SVE-FIXED-NEXT: umov w10, v1.b[1]
+; SVE-FIXED-NEXT: fmov s7, w13
+; SVE-FIXED-NEXT: umov w13, v1.b[13]
+; SVE-FIXED-NEXT: fmov s16, w14
+; SVE-FIXED-NEXT: fmov s18, w16
+; SVE-FIXED-NEXT: mov v4.s[1], w8
+; SVE-FIXED-NEXT: umov w8, v1.b[5]
+; SVE-FIXED-NEXT: mov v3.s[1], w11
+; SVE-FIXED-NEXT: mov v5.s[1], w9
+; SVE-FIXED-NEXT: mov v6.s[1], w12
+; SVE-FIXED-NEXT: umov w9, v1.b[9]
+; SVE-FIXED-NEXT: fmov s17, w15
+; SVE-FIXED-NEXT: mov v18.s[1], w10
+; SVE-FIXED-NEXT: mov z19.d, z2.d
+; SVE-FIXED-NEXT: mov v7.s[1], w13
+; SVE-FIXED-NEXT: mov z20.d, z2.d
+; SVE-FIXED-NEXT: mov z21.d, z2.d
+; SVE-FIXED-NEXT: mov v16.s[1], w8
+; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT: ushll v6.2d, v6.2s, #0
+; SVE-FIXED-NEXT: mov v17.s[1], w9
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: ushll v18.2d, v18.2s, #0
+; SVE-FIXED-NEXT: mov z25.d, z2.d
+; SVE-FIXED-NEXT: ushll v7.2d, v7.2s, #0
+; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT: ushll v16.2d, v16.2s, #0
+; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT: shl v6.2d, v6.2d, #63
+; SVE-FIXED-NEXT: mov z22.d, z2.d
+; SVE-FIXED-NEXT: mov z23.d, z2.d
+; SVE-FIXED-NEXT: add z19.d, z19.d, #6 // =0x6
+; SVE-FIXED-NEXT: shl v18.2d, v18.2d, #63
+; SVE-FIXED-NEXT: ushll v17.2d, v17.2s, #0
+; SVE-FIXED-NEXT: shl v7.2d, v7.2d, #63
+; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT: add z25.d, z25.d, #14 // =0xe
+; SVE-FIXED-NEXT: shl v16.2d, v16.2d, #63
+; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT: add z20.d, z20.d, #10 // =0xa
+; SVE-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
+; SVE-FIXED-NEXT: add z21.d, z21.d, #2 // =0x2
+; SVE-FIXED-NEXT: mov z24.d, z2.d
+; SVE-FIXED-NEXT: shl v17.2d, v17.2d, #63
+; SVE-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
+; SVE-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
+; SVE-FIXED-NEXT: add z22.d, z22.d, #12 // =0xc
+; SVE-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
+; SVE-FIXED-NEXT: add z23.d, z23.d, #4 // =0x4
+; SVE-FIXED-NEXT: and v3.16b, v3.16b, v25.16b
+; SVE-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
+; SVE-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
+; SVE-FIXED-NEXT: and v6.16b, v6.16b, v21.16b
+; SVE-FIXED-NEXT: cmlt v17.2d, v17.2d, #0
+; SVE-FIXED-NEXT: add z24.d, z24.d, #8 // =0x8
+; SVE-FIXED-NEXT: and v2.16b, v18.16b, v2.16b
+; SVE-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
+; SVE-FIXED-NEXT: and v16.16b, v16.16b, v23.16b
+; SVE-FIXED-NEXT: cmhi v18.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT: shl v1.16b, v1.16b, #7
+; SVE-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
+; SVE-FIXED-NEXT: and v17.16b, v17.16b, v24.16b
+; SVE-FIXED-NEXT: cmhi v20.2d, v16.2d, v7.2d
+; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v18.16b
+; SVE-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
+; SVE-FIXED-NEXT: mov v4.16b, v19.16b
+; SVE-FIXED-NEXT: cmhi v21.2d, v2.2d, v17.2d
+; SVE-FIXED-NEXT: umaxv b1, v1.16b
+; SVE-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
+; SVE-FIXED-NEXT: mov v5.16b, v20.16b
+; SVE-FIXED-NEXT: bif v2.16b, v17.16b, v21.16b
+; SVE-FIXED-NEXT: bsl v5.16b, v16.16b, v7.16b
+; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: cmhi d4, d2, d3
+; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT: ldrb w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
+ ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i16:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
+; NEON-FIXED-NEXT: umov w8, v1.b[6]
+; NEON-FIXED-NEXT: umov w9, v1.b[2]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: umov w11, v1.b[4]
+; NEON-FIXED-NEXT: umov w12, v1.b[0]
+; NEON-FIXED-NEXT: umov w10, v1.b[7]
+; NEON-FIXED-NEXT: umov w13, v1.b[3]
+; NEON-FIXED-NEXT: umov w14, v1.b[5]
+; NEON-FIXED-NEXT: umov w15, v1.b[1]
+; NEON-FIXED-NEXT: shl v1.8b, v1.8b, #7
+; NEON-FIXED-NEXT: fmov s2, w8
+; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT: fmov s3, w9
+; NEON-FIXED-NEXT: fmov s4, w11
+; NEON-FIXED-NEXT: adrp x9, .LCPI1_1
+; NEON-FIXED-NEXT: ldr q6, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT: fmov s5, w12
+; NEON-FIXED-NEXT: adrp x8, .LCPI1_3
+; NEON-FIXED-NEXT: ldr q7, [x9, :lo12:.LCPI1_1]
+; NEON-FIXED-NEXT: mov v2.s[1], w10
+; NEON-FIXED-NEXT: mov v3.s[1], w13
+; NEON-FIXED-NEXT: adrp x10, .LCPI1_2
+; NEON-FIXED-NEXT: mov v4.s[1], w14
+; NEON-FIXED-NEXT: ldr q16, [x10, :lo12:.LCPI1_2]
+; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI1_3]
+; NEON-FIXED-NEXT: mov v5.s[1], w15
+; NEON-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; NEON-FIXED-NEXT: umaxv b1, v1.8b
+; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; NEON-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
+; NEON-FIXED-NEXT: and v3.16b, v3.16b, v7.16b
+; NEON-FIXED-NEXT: and v4.16b, v4.16b, v16.16b
+; NEON-FIXED-NEXT: and v5.16b, v5.16b, v17.16b
+; NEON-FIXED-NEXT: cmhi v6.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v4.2d
+; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v6.16b
+; NEON-FIXED-NEXT: mov v3.16b, v7.16b
+; NEON-FIXED-NEXT: bsl v3.16b, v5.16b, v4.16b
+; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
+; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: cmhi d4, d2, d3
+; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT: ldrh w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i16:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
+; SVE-FIXED-NEXT: umov w8, v1.b[0]
+; SVE-FIXED-NEXT: umov w10, v1.b[6]
+; SVE-FIXED-NEXT: index z6.d, #0, #1
+; SVE-FIXED-NEXT: umov w11, v1.b[2]
+; SVE-FIXED-NEXT: umov w14, v1.b[4]
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: umov w9, v1.b[1]
+; SVE-FIXED-NEXT: umov w12, v1.b[7]
+; SVE-FIXED-NEXT: umov w13, v1.b[3]
+; SVE-FIXED-NEXT: fmov s2, w8
+; SVE-FIXED-NEXT: umov w8, v1.b[5]
+; SVE-FIXED-NEXT: fmov s3, w10
+; SVE-FIXED-NEXT: fmov s4, w11
+; SVE-FIXED-NEXT: fmov s5, w14
+; SVE-FIXED-NEXT: mov z7.d, z6.d
+; SVE-FIXED-NEXT: mov z16.d, z6.d
+; SVE-FIXED-NEXT: mov z17.d, z6.d
+; SVE-FIXED-NEXT: shl v1.8b, v1.8b, #7
+; SVE-FIXED-NEXT: mov v2.s[1], w9
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov v3.s[1], w12
+; SVE-FIXED-NEXT: mov v4.s[1], w13
+; SVE-FIXED-NEXT: mov v5.s[1], w8
+; SVE-FIXED-NEXT: add z7.d, z7.d, #2 // =0x2
+; SVE-FIXED-NEXT: add z17.d, z17.d, #6 // =0x6
+; SVE-FIXED-NEXT: add z16.d, z16.d, #4 // =0x4
+; SVE-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
+; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
+; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
+; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
+; SVE-FIXED-NEXT: umaxv b1, v1.8b
+; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
+; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
+; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
+; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
+; SVE-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
+; SVE-FIXED-NEXT: and v3.16b, v3.16b, v17.16b
+; SVE-FIXED-NEXT: and v4.16b, v4.16b, v7.16b
+; SVE-FIXED-NEXT: and v5.16b, v5.16b, v16.16b
+; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
+; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
+; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
+; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
+; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: cmhi d4, d2, d3
+; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT: ldrh w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru)
+ ret i16 %res
+}
+
+define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
+; NEON-FIXED-LABEL: extract_last_i32:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: ushll v2.4s, v1.4h, #0
+; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
+; NEON-FIXED-NEXT: adrp x9, .LCPI2_1
+; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT: ldr q5, [x9, :lo12:.LCPI2_1]
+; NEON-FIXED-NEXT: shl v1.4h, v1.4h, #15
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
+; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; NEON-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
+; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; NEON-FIXED-NEXT: and v3.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: and v2.16b, v2.16b, v5.16b
+; NEON-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; NEON-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
+; NEON-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: cmhi d4, d3, d2
+; NEON-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT: ldr w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_i32:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: ushll v2.4s, v1.4h, #0
+; SVE-FIXED-NEXT: index z4.d, #0, #1
+; SVE-FIXED-NEXT: shl v1.4h, v1.4h, #15
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
+; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
+; SVE-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
+; SVE-FIXED-NEXT: mov z5.d, z4.d
+; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
+; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
+; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: add z5.d, z5.d, #2 // =0x2
+; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
+; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
+; SVE-FIXED-NEXT: and v2.16b, v2.16b, v4.16b
+; SVE-FIXED-NEXT: and v3.16b, v3.16b, v5.16b
+; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
+; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
+; SVE-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: cmhi d4, d3, d2
+; SVE-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT: ldr w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
+ ret i32 %res
+}
+
+define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) {
+; CHECK-LABEL: extract_last_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ushll v3.2d, v1.2s, #0
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d2, xzr
+; CHECK-NEXT: fmov d4, x8
+; CHECK-NEXT: shl v1.2s, v1.2s, #31
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: shl v3.2d, v3.2d, #63
+; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: and v3.8b, v3.8b, v4.8b
+; CHECK-NEXT: cmhi d2, d2, d3
+; CHECK-NEXT: bic v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: orr x8, x9, x8, lsl #3
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: ldr x8, [x8]
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x0, x8, x0, ne
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru)
+ ret i64 %res
+}
+
+define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z3.d, #0 // =0x0
+; CHECK-NEXT: punpkhi p4.h, p0.b
+; CHECK-NEXT: punpklo p5.h, p2.b
+; CHECK-NEXT: punpkhi p1.h, p4.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: punpkhi p3.h, p2.b
+; CHECK-NEXT: punpklo p2.h, p4.b
+; CHECK-NEXT: incd z2.d
+; CHECK-NEXT: incd z5.d, all, mul #2
+; CHECK-NEXT: punpklo p4.h, p5.b
+; CHECK-NEXT: incd z6.d, all, mul #4
+; CHECK-NEXT: punpkhi p6.h, p1.b
+; CHECK-NEXT: punpkhi p7.h, p3.b
+; CHECK-NEXT: sel z1.d, p4, z1.d, z3.d
+; CHECK-NEXT: mov z4.d, z2.d
+; CHECK-NEXT: mov z7.d, z2.d
+; CHECK-NEXT: mov z25.d, z5.d
+; CHECK-NEXT: punpkhi p5.h, p5.b
+; CHECK-NEXT: punpkhi p4.h, p2.b
+; CHECK-NEXT: incd z4.d, all, mul #2
+; CHECK-NEXT: incd z25.d, all, mul #4
+; CHECK-NEXT: incd z7.d, all, mul #4
+; CHECK-NEXT: punpklo p3.h, p3.b
+; CHECK-NEXT: sel z2.d, p5, z2.d, z3.d
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: mov z24.d, z4.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: sel z5.d, p3, z5.d, z3.d
+; CHECK-NEXT: sel z4.d, p7, z4.d, z3.d
+; CHECK-NEXT: sel z6.d, p2, z6.d, z3.d
+; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: sel z25.d, p1, z25.d, z3.d
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: incd z24.d, all, mul #4
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z6.d
+; CHECK-NEXT: sel z24.d, p6, z24.d, z3.d
+; CHECK-NEXT: mov z3.d, p4/m, z7.d
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: umax z4.d, p1/m, z4.d, z24.d
+; CHECK-NEXT: umax z2.d, p1/m, z2.d, z3.d
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: umax z3.d, p1/m, z3.d, z25.d
+; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p1.b, xzr, x8
+; CHECK-NEXT: ptest p0, p0.b
+; CHECK-NEXT: lastb w8, p1, z0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
+ ret i8 %res
+}
+
+define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: mov z5.d, #0 // =0x0
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p3.h, p1.b
+; CHECK-NEXT: punpkhi p4.h, p2.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: incd z2.d
+; CHECK-NEXT: incd z3.d, all, mul #2
+; CHECK-NEXT: sel z1.d, p2, z1.d, z5.d
+; CHECK-NEXT: mov z4.d, z2.d
+; CHECK-NEXT: sel z2.d, p4, z2.d, z5.d
+; CHECK-NEXT: sel z3.d, p1, z3.d, z5.d
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: incd z4.d, all, mul #2
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
+; CHECK-NEXT: sel z4.d, p3, z4.d, z5.d
+; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p1.h, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.h
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+ ret i16 %res
+}
+
+define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: mov z3.d, #0 // =0x0
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: sel z1.d, p2, z1.d, z3.d
+; CHECK-NEXT: incd z2.d
+; CHECK-NEXT: sel z2.d, p1, z2.d, z3.d
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p1.s, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.s
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: csel w0, w8, w0, ne
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
+ ret i32 %res
+}
+
+define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
+; CHECK-LABEL: extract_last_i64_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT: umaxv d1, p1, z1.d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: whilels p2.d, xzr, x8
+; CHECK-NEXT: ptest p1, p0.b
+; CHECK-NEXT: lastb x8, p2, z0.d
+; CHECK-NEXT: csel x0, x8, x0, ne
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
+ ret i64 %res
+}
+
+declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
+declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
----------------
paulwalker-arm wrote:
Any reason for not having floating-point tests?
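For reference, a floating-point variant would presumably exercise the same lowering path as the integer tests above. A minimal hypothetical sketch (the function name and v4f32 mangling below are illustrative, not taken from the patch):

define float @extract_last_float(<4 x float> %data, <4 x i1> %mask, float %passthru) {
  ; Same pattern as the integer tests: return the last active lane of %data,
  ; or %passthru when no lane of %mask is set.
  %res = call float @llvm.experimental.vector.masked.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %mask, float %passthru)
  ret float %res
}

declare float @llvm.experimental.vector.masked.extract.last.active.v4f32(<4 x float>, <4 x i1>, float)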
https://github.com/llvm/llvm-project/pull/113587