[llvm] fcd058a - [SVE][CodeGen] Restructure SVE fixed length tests to use update_llc_test_checks.
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 16 16:32:30 PDT 2022
Author: Paul Walker
Date: 2022-06-17T00:30:56+01:00
New Revision: fcd058acc95c14ae5202a22548c5e40287e593c5
URL: https://github.com/llvm/llvm-project/commit/fcd058acc95c14ae5202a22548c5e40287e593c5
DIFF: https://github.com/llvm/llvm-project/commit/fcd058acc95c14ae5202a22548c5e40287e593c5.diff
LOG: [SVE][CodeGen] Restructure SVE fixed length tests to use update_llc_test_checks.
Most tests have been updated to make use of vscale_range to reduce
the number of RUN lines. For the remaining RUN lines, the check
prefixes have been updated to ensure the original expectations of
the manual CHECK lines are maintained after update_llc_test_checks
is run.
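
To illustrate the new structure, here is a minimal sketch (not part of the patch; the function name is illustrative and the body mirrors the ctpop_v32i8 case in the diff below). Annotating a function with vscale_range pins the minimum SVE register width, so its code generation is identical for every RUN line, the shared CHECK prefix is sufficient, and utils/update_llc_test_checks.py can regenerate the assertions mechanically:

; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; vscale_range(2,0) guarantees at least 256-bit SVE registers, so both RUN
; lines produce the same code for this 256-bit fixed-length vector and the
; plain CHECK prefix covers them; the CHECK lines are what
; utils/update_llc_test_checks.py would emit.
define void @example_ctpop_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: example_ctpop_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)

attributes #0 = { "target-features"="+sve" }

Functions whose code generation still differs by vector length (for example the 64-element cases in the diff) keep separate VBITS_GE_256/VBITS_GE_512 check bodies instead of the plain CHECK prefix.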
Added:
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
Modified:
llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
Removed:
llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
index 201f4e6945ee..f2dc770242a6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -1,54 +1,42 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; CLZ
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @ctlz_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @ctlz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v8i8:
-; CHECK: clz v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @ctlz_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @ctlz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v16i8:
-; CHECK: clz v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
ret <16 x i8> %res
}
-define void @ctlz_v32i8(<32 x i8>* %a) #0 {
+define void @ctlz_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
store <32 x i8> %res, <32 x i8>* %a
@@ -56,49 +44,53 @@ define void @ctlz_v32i8(<32 x i8>* %a) #0 {
}
define void @ctlz_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: ctlz_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: clz z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT: clz z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: ctlz_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: clz z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %op)
store <64 x i8> %res, <64 x i8>* %a
ret void
}
-define void @ctlz_v128i8(<128 x i8>* %a) #0 {
+define void @ctlz_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctlz_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call <128 x i8> @llvm.ctlz.v128i8(<128 x i8> %op)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
-define void @ctlz_v256i8(<256 x i8>* %a) #0 {
+define void @ctlz_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctlz_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call <256 x i8> @llvm.ctlz.v256i8(<256 x i8> %op)
store <256 x i8> %res, <256 x i8>* %a
@@ -106,30 +98,33 @@ define void @ctlz_v256i8(<256 x i8>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @ctlz_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @ctlz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v4i16:
-; CHECK: clz v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @ctlz_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @ctlz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v8i16:
-; CHECK: clz v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
-define void @ctlz_v16i16(<16 x i16>* %a) #0 {
+define void @ctlz_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -137,49 +132,53 @@ define void @ctlz_v16i16(<16 x i16>* %a) #0 {
}
define void @ctlz_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: ctlz_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: clz z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: clz z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ctlz_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: clz z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
-define void @ctlz_v64i16(<64 x i16>* %a) #0 {
+define void @ctlz_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctlz_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.ctlz.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @ctlz_v128i16(<128 x i16>* %a) #0 {
+define void @ctlz_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctlz_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.ctlz.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
@@ -187,30 +186,33 @@ define void @ctlz_v128i16(<128 x i16>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @ctlz_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @ctlz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v2i32:
-; CHECK: clz v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @ctlz_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @ctlz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v4i32:
-; CHECK: clz v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
-define void @ctlz_v8i32(<8 x i32>* %a) #0 {
+define void @ctlz_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -218,80 +220,91 @@ define void @ctlz_v8i32(<8 x i32>* %a) #0 {
}
define void @ctlz_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: ctlz_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: clz z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: clz z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ctlz_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: clz z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
-define void @ctlz_v32i32(<32 x i32>* %a) #0 {
+define void @ctlz_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctlz_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.ctlz.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @ctlz_v64i32(<64 x i32>* %a) #0 {
+define void @ctlz_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctlz_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.ctlz.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
ret void
}
-define <1 x i64> @ctlz_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: clz z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
-define <2 x i64> @ctlz_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: clz z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
-define void @ctlz_v4i64(<4 x i64>* %a) #0 {
+define void @ctlz_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctlz_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@@ -299,49 +312,53 @@ define void @ctlz_v4i64(<4 x i64>* %a) #0 {
}
define void @ctlz_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: ctlz_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: clz z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: clz z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ctlz_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: clz z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
-define void @ctlz_v16i64(<16 x i64>* %a) #0 {
+define void @ctlz_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctlz_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @ctlz_v32i64(<32 x i64>* %a) #0 {
+define void @ctlz_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctlz_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.ctlz.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
@@ -353,30 +370,33 @@ define void @ctlz_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @ctpop_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @ctpop_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v8i8:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @ctpop_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @ctpop_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v16i8:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
ret <16 x i8> %res
}
-define void @ctpop_v32i8(<32 x i8>* %a) #0 {
+define void @ctpop_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
store <32 x i8> %res, <32 x i8>* %a
@@ -384,49 +404,53 @@ define void @ctpop_v32i8(<32 x i8>* %a) #0 {
}
define void @ctpop_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: ctpop_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: cnt z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT: cnt z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: ctpop_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: cnt z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %op)
store <64 x i8> %res, <64 x i8>* %a
ret void
}
-define void @ctpop_v128i8(<128 x i8>* %a) #0 {
+define void @ctpop_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctpop_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call <128 x i8> @llvm.ctpop.v128i8(<128 x i8> %op)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
-define void @ctpop_v256i8(<256 x i8>* %a) #0 {
+define void @ctpop_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctpop_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call <256 x i8> @llvm.ctpop.v256i8(<256 x i8> %op)
store <256 x i8> %res, <256 x i8>* %a
@@ -434,32 +458,35 @@ define void @ctpop_v256i8(<256 x i8>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @ctpop_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i16:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @ctpop_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v8i16:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
-define void @ctpop_v16i16(<16 x i16>* %a) #0 {
+define void @ctpop_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -467,49 +494,53 @@ define void @ctpop_v16i16(<16 x i16>* %a) #0 {
}
define void @ctpop_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: ctpop_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: cnt z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: cnt z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ctpop_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: cnt z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
-define void @ctpop_v64i16(<64 x i16>* %a) #0 {
+define void @ctpop_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctpop_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.ctpop.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @ctpop_v128i16(<128 x i16>* %a) #0 {
+define void @ctpop_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctpop_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.ctpop.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
@@ -517,34 +548,37 @@ define void @ctpop_v128i16(<128 x i16>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @ctpop_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v2i32:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @ctpop_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i32:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
-define void @ctpop_v8i32(<8 x i32>* %a) #0 {
+define void @ctpop_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -552,49 +586,53 @@ define void @ctpop_v8i32(<8 x i32>* %a) #0 {
}
define void @ctpop_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: ctpop_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: cnt z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: cnt z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ctpop_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: cnt z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
-define void @ctpop_v32i32(<32 x i32>* %a) #0 {
+define void @ctpop_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctpop_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.ctpop.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @ctpop_v64i32(<64 x i32>* %a) #0 {
+define void @ctpop_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctpop_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.ctpop.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
@@ -602,36 +640,39 @@ define void @ctpop_v64i32(<64 x i32>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @ctpop_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v1i64:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: uaddlp v0.1d, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-NEXT: uaddlp v0.1d, v0.2s
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @ctpop_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v2i64:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
-define void @ctpop_v4i64(<4 x i64>* %a) #0 {
+define void @ctpop_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@@ -639,49 +680,53 @@ define void @ctpop_v4i64(<4 x i64>* %a) #0 {
}
define void @ctpop_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: ctpop_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: cnt z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: cnt z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ctpop_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: cnt z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
-define void @ctpop_v16i64(<16 x i64>* %a) #0 {
+define void @ctpop_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: ctpop_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @ctpop_v32i64(<32 x i64>* %a) #0 {
+define void @ctpop_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: ctpop_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.ctpop.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
@@ -692,34 +737,39 @@ define void @ctpop_v32i64(<32 x i64>* %a) #0 {
; Count trailing zeros
;
-define <8 x i8> @cttz_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].b, p0/m, z0.b
-; CHECK-NEXT: clz v0.8b, v[[RBIT]].8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: clz v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
ret <8 x i8> %res
}
-define <16 x i8> @cttz_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].b, p0/m, z0.b
-; CHECK-NEXT: clz v0.16b, v[[RBIT]].16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: clz v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
ret <16 x i8> %res
}
-define void @cttz_v32i8(<32 x i8>* %a) #0 {
+define void @cttz_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
store <32 x i8> %res, <32 x i8>* %a
@@ -727,88 +777,97 @@ define void @cttz_v32i8(<32 x i8>* %a) #0 {
}
define void @cttz_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: cttz_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT: rbit z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT: clz z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT: clz z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[RBIT_LO]].b
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[RBIT_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: cttz_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT: clz z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %op)
store <64 x i8> %res, <64 x i8>* %a
ret void
}
-define void @cttz_v128i8(<128 x i8>* %a) #0 {
+define void @cttz_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: cttz_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call <128 x i8> @llvm.cttz.v128i8(<128 x i8> %op)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
-define void @cttz_v256i8(<256 x i8>* %a) #0 {
+define void @cttz_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: cttz_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call <256 x i8> @llvm.cttz.v256i8(<256 x i8> %op)
store <256 x i8> %res, <256 x i8>* %a
ret void
}
-define <4 x i16> @cttz_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].h, p0/m, z0.h
-; CHECK-NEXT: clz v0.4h, v[[RBIT]].4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: clz v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
-define <8 x i16> @cttz_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @cttz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].h, p0/m, z0.h
-; CHECK-NEXT: clz v0.8h, v[[RBIT]].8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: clz v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
-define void @cttz_v16i16(<16 x i16>* %a) #0 {
+define void @cttz_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -816,54 +875,58 @@ define void @cttz_v16i16(<16 x i16>* %a) #0 {
}
define void @cttz_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: cttz_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[RBIT_LO]].h
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[RBIT_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: rbit z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: clz z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: clz z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: cttz_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: clz z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
-define void @cttz_v64i16(<64 x i16>* %a) #0 {
+define void @cttz_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: cttz_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.cttz.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @cttz_v128i16(<128 x i16>* %a) #0 {
+define void @cttz_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: cttz_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.cttz.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
@@ -871,35 +934,40 @@ define void @cttz_v128i16(<128 x i16>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @cttz_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].s, p0/m, z0.s
-; CHECK-NEXT: clz v0.2s, v[[RBIT]].2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: clz v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @cttz_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @cttz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].s, p0/m, z0.s
-; CHECK-NEXT: clz v0.4s, v[[RBIT]].4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: clz v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
-define void @cttz_v8i32(<8 x i32>* %a) #0 {
+define void @cttz_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -907,88 +975,99 @@ define void @cttz_v8i32(<8 x i32>* %a) #0 {
}
define void @cttz_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: cttz_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[RBIT_LO]].s
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[RBIT_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: rbit z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: clz z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: clz z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: cttz_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: clz z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
-define void @cttz_v32i32(<32 x i32>* %a) #0 {
+define void @cttz_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: cttz_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.cttz.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @cttz_v64i32(<64 x i32>* %a) #0 {
+define void @cttz_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: cttz_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.cttz.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
ret void
}
-define <1 x i64> @cttz_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: clz z0.d, [[PG]]/m, [[RBIT]].d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
-define <2 x i64> @cttz_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: clz z0.d, [[PG]]/m, [[RBIT]].d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
-define void @cttz_v4i64(<4 x i64>* %a) #0 {
+define void @cttz_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@@ -996,54 +1075,58 @@ define void @cttz_v4i64(<4 x i64>* %a) #0 {
}
define void @cttz_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: cttz_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[RBIT_LO]].d
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[RBIT_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: rbit z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: clz z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: clz z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: cttz_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: clz z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
-define void @cttz_v16i64(<16 x i64>* %a) #0 {
+define void @cttz_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: cttz_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @cttz_v32i64(<32 x i64>* %a) #0 {
+define void @cttz_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: cttz_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.cttz.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
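
The idiom the regenerated checks capture is worth spelling out once: SVE has no vector count-trailing-zeros instruction, so cttz is lowered as a bit reverse (rbit) followed by a count-leading-zeros (clz), and a fixed-length vector wider than the guaranteed register size is split into two halves addressed through a scaled offset, as the VBITS_GE_256 blocks above show. A minimal sketch of a converted test in the same style (illustrative only, not part of this commit; the function name is invented and the "+sve" target-features attribute group normally found at the bottom of these files is assumed):

  ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
  target triple = "aarch64-unknown-linux-gnu"

  ; Illustrative sketch only; the checks mirror the cttz_v8i32 output shown above.
  define void @cttz_example(<8 x i32>* %a) vscale_range(2,0) #0 {
  ; CHECK-LABEL: cttz_example:
  ; CHECK:       ptrue p0.s, vl8
  ; CHECK-NEXT:  ld1w { z0.s }, p0/z, [x0]
  ; CHECK-NEXT:  rbit z0.s, p0/m, z0.s
  ; CHECK-NEXT:  clz z0.s, p0/m, z0.s
  ; CHECK-NEXT:  st1w { z0.s }, p0, [x0]
  ; CHECK-NEXT:  ret
    %op = load <8 x i32>, <8 x i32>* %a
    %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op, i1 false)
    store <8 x i32> %res, <8 x i32>* %a
    ret void
  }

  declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
  attributes #0 = { "target-features"="+sve" } ; assumed attribute group
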
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
index eb4186cd6aef..45008aa7abfd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
@@ -1,31 +1,17 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
; Don't use SVE for 64-bit vectors.
-define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
+define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v4i16:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <4 x i16>, <4 x i16>* %a
%cast = bitcast <4 x i16> %load to <4 x half>
store volatile <4 x half> %cast, <4 x half>* %b
@@ -33,23 +19,25 @@ define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) #0 {
+define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <8 x i16>, <8 x i16>* %a
%cast = bitcast <8 x i16> %load to <8 x half>
store volatile <8 x half> %cast, <8 x half>* %b
ret void
}
-define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
+define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <16 x i16>, <16 x i16>* %a
%cast = bitcast <16 x i16> %load to <16 x half>
store volatile <16 x half> %cast, <16 x half>* %b
@@ -57,35 +45,48 @@ define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
}
define void @bitcast_v32i16(<32 x i16> *%a, <32 x half>* %b) #0 {
-; CHECK-LABEL: bitcast_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitcast_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%load = load volatile <32 x i16>, <32 x i16>* %a
%cast = bitcast <32 x i16> %load to <32 x half>
store volatile <32 x half> %cast, <32 x half>* %b
ret void
}
-define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) #0 {
+define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: bitcast_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <64 x i16>, <64 x i16>* %a
%cast = bitcast <64 x i16> %load to <64 x half>
store volatile <64 x half> %cast, <64 x half>* %b
ret void
}
-define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
+define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: bitcast_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <128 x i16>, <128 x i16>* %a
%cast = bitcast <128 x i16> %load to <128 x half>
store volatile <128 x half> %cast, <128 x half>* %b
@@ -93,11 +94,12 @@ define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
+define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v2i32:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <2 x i32>, <2 x i32>* %a
%cast = bitcast <2 x i32> %load to <2 x float>
store volatile <2 x float> %cast, <2 x float>* %b
@@ -105,23 +107,25 @@ define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) #0 {
+define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v4i32:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <4 x i32>, <4 x i32>* %a
%cast = bitcast <4 x i32> %load to <4 x float>
store volatile <4 x float> %cast, <4 x float>* %b
ret void
}
-define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
+define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <8 x i32>, <8 x i32>* %a
%cast = bitcast <8 x i32> %load to <8 x float>
store volatile <8 x float> %cast, <8 x float>* %b
@@ -129,35 +133,48 @@ define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
}
define void @bitcast_v16i32(<16 x i32> *%a, <16 x float>* %b) #0 {
-; CHECK-LABEL: bitcast_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitcast_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%load = load volatile <16 x i32>, <16 x i32>* %a
%cast = bitcast <16 x i32> %load to <16 x float>
store volatile <16 x float> %cast, <16 x float>* %b
ret void
}
-define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) #0 {
+define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: bitcast_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <32 x i32>, <32 x i32>* %a
%cast = bitcast <32 x i32> %load to <32 x float>
store volatile <32 x float> %cast, <32 x float>* %b
ret void
}
-define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
+define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: bitcast_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <64 x i32>, <64 x i32>* %a
%cast = bitcast <64 x i32> %load to <64 x float>
store volatile <64 x float> %cast, <64 x float>* %b
@@ -165,11 +182,12 @@ define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
+define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v1i64:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <1 x i64>, <1 x i64>* %a
%cast = bitcast <1 x i64> %load to <1 x double>
store volatile <1 x double> %cast, <1 x double>* %b
@@ -177,23 +195,25 @@ define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) #0 {
+define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v2i64:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <2 x i64>, <2 x i64>* %a
%cast = bitcast <2 x i64> %load to <2 x double>
store volatile <2 x double> %cast, <2 x double>* %b
ret void
}
-define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
+define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <4 x i64>, <4 x i64>* %a
%cast = bitcast <4 x i64> %load to <4 x double>
store volatile <4 x double> %cast, <4 x double>* %b
@@ -201,35 +221,48 @@ define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
}
define void @bitcast_v8i64(<8 x i64> *%a, <8 x double>* %b) #0 {
-; CHECK-LABEL: bitcast_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitcast_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%load = load volatile <8 x i64>, <8 x i64>* %a
%cast = bitcast <8 x i64> %load to <8 x double>
store volatile <8 x double> %cast, <8 x double>* %b
ret void
}
-define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) #0 {
+define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: bitcast_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <16 x i64>, <16 x i64>* %a
%cast = bitcast <16 x i64> %load to <16 x double>
store volatile <16 x double> %cast, <16 x double>* %b
ret void
}
-define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) #0 {
+define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: bitcast_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%load = load volatile <32 x i64>, <32 x i64>* %a
%cast = bitcast <32 x i64> %load to <32 x double>
store volatile <32 x double> %cast, <32 x double>* %b
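
The RUN-line consolidation at the top of this file shows the scheme used for the rest of the patch: three invocations (256-, 512- and 2048-bit minimum vector length) replace the previous sixteen, the 2048-bit run reuses the VBITS_GE_512 prefix, and functions whose code should be identical at every legal length carry vscale_range so the common CHECK prefix covers them. When the generated assembly changes, the checks are refreshed with the updater named in the NOTE line rather than edited by hand; a typical invocation from the top of an LLVM checkout (paths are illustrative, and --llc-binary may point at any recent build of llc) is:

  $ llvm/utils/update_llc_test_checks.py \
        --llc-binary build/bin/llc \
        llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
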
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
index becdc912d1fb..50b0be2601cf 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -1,57 +1,47 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; i8
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) #0 {
+define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8i8:
-; CHECK: uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v16i8:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
-define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) #0 {
+define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v32i8:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], z[[OP1]].b, z[[OP2]].b
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b, vl32
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: st1b { z1.b }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <16 x i8>, <16 x i8>* %a
%op2 = load <16 x i8>, <16 x i8>* %b
%res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -63,14 +53,25 @@ define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) #0 {
}
define void @concat_v64i8(<32 x i8>* %a, <32 x i8>* %b, <64 x i8>* %c) #0 {
-; CHECK-LABEL: concat_v64i8:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2, x8]
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.b, p0, z0.b, z1.b
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -85,15 +86,16 @@ define void @concat_v64i8(<32 x i8>* %a, <32 x i8>* %b, <64 x i8>* %c) #0 {
ret void
}
-define void @concat_v128i8(<64 x i8>* %a, <64 x i8>* %b, <128 x i8>* %c) #0 {
+define void @concat_v128i8(<64 x i8>* %a, <64 x i8>* %b, <128 x i8>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v128i8:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: st1b { z0.b }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = shufflevector <64 x i8> %op1, <64 x i8> %op2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -116,15 +118,16 @@ define void @concat_v128i8(<64 x i8>* %a, <64 x i8>* %b, <128 x i8>* %c) #0 {
ret void
}
-define void @concat_v256i8(<128 x i8>* %a, <128 x i8>* %b, <256 x i8>* %c) #0 {
+define void @concat_v256i8(<128 x i8>* %a, <128 x i8>* %b, <256 x i8>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v256i8:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: st1b { z0.b }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = shufflevector <128 x i8> %op1, <128 x i8> %op2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -168,32 +171,37 @@ define void @concat_v256i8(<128 x i8>* %a, <128 x i8>* %b, <256 x i8>* %c) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) #0 {
+define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4i16:
-; CHECK: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8i16:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
-define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) #0 {
+define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v16i16:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], z[[OP1]].h, z[[OP2]].h
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: st1h { z1.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <8 x i16>, <8 x i16>* %a
%op2 = load <8 x i16>, <8 x i16>* %b
%res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -203,14 +211,25 @@ define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) #0 {
}
define void @concat_v32i16(<16 x i16>* %a, <16 x i16>* %b, <32 x i16>* %c) #0 {
-; CHECK-LABEL: concat_v32i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -221,15 +240,16 @@ define void @concat_v32i16(<16 x i16>* %a, <16 x i16>* %b, <32 x i16>* %c) #0 {
ret void
}
-define void @concat_v64i16(<32 x i16>* %a, <32 x i16>* %b, <64 x i16>* %c) #0 {
+define void @concat_v64i16(<32 x i16>* %a, <32 x i16>* %b, <64 x i16>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v64i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = shufflevector <32 x i16> %op1, <32 x i16> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -244,15 +264,16 @@ define void @concat_v64i16(<32 x i16>* %a, <32 x i16>* %b, <64 x i16>* %c) #0 {
ret void
}
-define void @concat_v128i16(<64 x i16>* %a, <64 x i16>* %b, <128 x i16>* %c) #0 {
+define void @concat_v128i16(<64 x i16>* %a, <64 x i16>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v128i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = shufflevector <64 x i16> %op1, <64 x i16> %op2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -280,32 +301,37 @@ define void @concat_v128i16(<64 x i16>* %a, <64 x i16>* %b, <128 x i16>* %c) #0
;
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) #0 {
+define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v2i32:
-; CHECK: zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4i32:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) #0 {
+define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8i32:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], z[[OP1]].s, z[[OP2]].s
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: st1w { z1.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <4 x i32>, <4 x i32>* %a
%op2 = load <4 x i32>, <4 x i32>* %b
%res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -314,14 +340,25 @@ define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) #0 {
}
define void @concat_v16i32(<8 x i32>* %a, <8 x i32>* %b, <16 x i32>* %c) #0 {
-; CHECK-LABEL: concat_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = shufflevector <8 x i32> %op1, <8 x i32> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -330,15 +367,16 @@ define void @concat_v16i32(<8 x i32>* %a, <8 x i32>* %b, <16 x i32>* %c) #0 {
ret void
}
-define void @concat_v32i32(<16 x i32>* %a, <16 x i32>* %b, <32 x i32>* %c) #0 {
+define void @concat_v32i32(<16 x i32>* %a, <16 x i32>* %b, <32 x i32>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = shufflevector <16 x i32> %op1, <16 x i32> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -349,15 +387,16 @@ define void @concat_v32i32(<16 x i32>* %a, <16 x i32>* %b, <32 x i32>* %c) #0 {
ret void
}
-define void @concat_v64i32(<32 x i32>* %a, <32 x i32>* %b, <64 x i32>* %c) #0 {
+define void @concat_v64i32(<32 x i32>* %a, <32 x i32>* %b, <64 x i32>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v64i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = shufflevector <32 x i32> %op1, <32 x i32> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -377,23 +416,27 @@ define void @concat_v64i32(<32 x i32>* %a, <32 x i32>* %b, <64 x i32>* %c) #0 {
;
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v2i64:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x i64> %res
}
-define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) #0 {
+define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4i64:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl2
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], z[[OP1]].d, z[[OP2]].d
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: st1d { z1.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <2 x i64>, <2 x i64>* %a
%op2 = load <2 x i64>, <2 x i64>* %b
%res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -402,14 +445,25 @@ define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) #0 {
}
define void @concat_v8i64(<4 x i64>* %a, <4 x i64>* %b, <8 x i64>* %c) #0 {
-; CHECK-LABEL: concat_v8i64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl4
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -417,15 +471,16 @@ define void @concat_v8i64(<4 x i64>* %a, <4 x i64>* %b, <8 x i64>* %c) #0 {
ret void
}
-define void @concat_v16i64(<8 x i64>* %a, <8 x i64>* %b, <16 x i64>* %c) #0 {
+define void @concat_v16i64(<8 x i64>* %a, <8 x i64>* %b, <16 x i64>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = shufflevector <8 x i64> %op1, <8 x i64> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -434,15 +489,16 @@ define void @concat_v16i64(<8 x i64>* %a, <8 x i64>* %b, <16 x i64>* %c) #0 {
ret void
}
-define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) #0 {
+define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = shufflevector <16 x i64> %op1, <16 x i64> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -458,32 +514,37 @@ define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) #0 {
+define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4f16:
-; CHECK: zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8f16:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %res
}
-define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) #0 {
+define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v16f16:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], z[[OP1]].h, z[[OP2]].h
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: st1h { z1.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%op2 = load <8 x half>, <8 x half>* %b
%res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -493,14 +554,25 @@ define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) #0 {
}
define void @concat_v32f16(<16 x half>* %a, <16 x half>* %b, <32 x half>* %c) #0 {
-; CHECK-LABEL: concat_v32f16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -511,15 +583,16 @@ define void @concat_v32f16(<16 x half>* %a, <16 x half>* %b, <32 x half>* %c) #0
ret void
}
-define void @concat_v64f16(<32 x half>* %a, <32 x half>* %b, <64 x half>* %c) #0 {
+define void @concat_v64f16(<32 x half>* %a, <32 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v64f16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = shufflevector <32 x half> %op1, <32 x half> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -534,15 +607,16 @@ define void @concat_v64f16(<32 x half>* %a, <32 x half>* %b, <64 x half>* %c) #0
ret void
}
-define void @concat_v128f16(<64 x half>* %a, <64 x half>* %b, <128 x half>* %c) #0 {
+define void @concat_v128f16(<64 x half>* %a, <64 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v128f16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = shufflevector <64 x half> %op1, <64 x half> %op2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -570,32 +644,37 @@ define void @concat_v128f16(<64 x half>* %a, <64 x half>* %b, <128 x half>* %c)
;
; Don't use SVE for 64-bit vectors.
-define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) #0 {
+define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v2f32:
-; CHECK: zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4f32:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %res
}
-define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) #0 {
+define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8f32:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], z[[OP1]].s, z[[OP2]].s
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: st1w { z1.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <4 x float>, <4 x float>* %a
%op2 = load <4 x float>, <4 x float>* %b
%res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -604,14 +683,25 @@ define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) #0
}
define void @concat_v16f32(<8 x float>* %a, <8 x float>* %b, <16 x float>* %c) #0 {
-; CHECK-LABEL: concat_v16f32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -620,15 +710,16 @@ define void @concat_v16f32(<8 x float>* %a, <8 x float>* %b, <16 x float>* %c) #
ret void
}
-define void @concat_v32f32(<16 x float>* %a, <16 x float>* %b, <32 x float>* %c) #0 {
+define void @concat_v32f32(<16 x float>* %a, <16 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v32f32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = shufflevector <16 x float> %op1, <16 x float> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -639,15 +730,16 @@ define void @concat_v32f32(<16 x float>* %a, <16 x float>* %b, <32 x float>* %c)
ret void
}
-define void @concat_v64f32(<32 x float>* %a, <32 x float>* %b, <64 x float>* %c) #0 {
+define void @concat_v64f32(<32 x float>* %a, <32 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v64f32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = shufflevector <32 x float> %op1, <32 x float> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -667,23 +759,27 @@ define void @concat_v64f32(<32 x float>* %a, <32 x float>* %b, <64 x float>* %c)
;
; Don't use SVE for 128-bit vectors.
-define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v2f64:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x double> %res
}
-define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c) #0 {
+define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4f64:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl2
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], z[[OP1]].d, z[[OP2]].d
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: st1d { z1.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <2 x double>, <2 x double>* %a
%op2 = load <2 x double>, <2 x double>* %b
%res = shufflevector <2 x double> %op1, <2 x double> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -692,14 +788,25 @@ define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c)
}
define void @concat_v8f64(<4 x double>* %a, <4 x double>* %b, <8 x double>* %c) #0 {
-; CHECK-LABEL: concat_v8f64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: concat_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl4
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2]
+; VBITS_GE_512-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -707,15 +814,16 @@ define void @concat_v8f64(<4 x double>* %a, <4 x double>* %b, <8 x double>* %c)
ret void
}
-define void @concat_v16f64(<8 x double>* %a, <8 x double>* %b, <16 x double>* %c) #0 {
+define void @concat_v16f64(<8 x double>* %a, <8 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: concat_v16f64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = shufflevector <8 x double> %op1, <8 x double> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -724,15 +832,16 @@ define void @concat_v16f64(<8 x double>* %a, <8 x double>* %b, <16 x double>* %c
ret void
}
-define void @concat_v32f64(<16 x double>* %a, <16 x double>* %b, <32 x double>* %c) #0 {
+define void @concat_v32f64(<16 x double>* %a, <16 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: concat_v32f64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = shufflevector <16 x double> %op1, <16 x double> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -747,12 +856,13 @@ define void @concat_v32f64(<16 x double>* %a, <16 x double>* %b, <32 x double>*
; undef
;
-define void @concat_v32i8_undef(<16 x i8>* %a, <32 x i8>* %b) #0 {
+define void @concat_v32i8_undef(<16 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v32i8_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: st1b { z[[OP1]].b }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i8>, <16 x i8>* %a
%res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
@@ -762,12 +872,13 @@ define void @concat_v32i8_undef(<16 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @concat_v16i16_undef(<8 x i16>* %a, <16 x i16>* %b) #0 {
+define void @concat_v16i16_undef(<8 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v16i16_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { z[[OP1]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x i16>, <8 x i16>* %a
%res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -775,24 +886,26 @@ define void @concat_v16i16_undef(<8 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @concat_v8i32_undef(<4 x i32>* %a, <8 x i32>* %b) #0 {
+define void @concat_v8i32_undef(<4 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8i32_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { z[[OP1]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x i32>, <4 x i32>* %a
%res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i32> %res, <8 x i32>* %b
ret void
}
-define void @concat_v4i64_undef(<2 x i64>* %a, <4 x i64>* %b) #0 {
+define void @concat_v4i64_undef(<2 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4i64_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { z[[OP1]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <2 x i64>, <2 x i64>* %a
%res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i64> %res, <4 x i64>* %b
@@ -803,12 +916,13 @@ define void @concat_v4i64_undef(<2 x i64>* %a, <4 x i64>* %b) #0 {
; > 2 operands
;
-define void @concat_v32i8_4op(<8 x i8>* %a, <32 x i8>* %b) #0 {
+define void @concat_v32i8_4op(<8 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v32i8_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: st1b { z[[OP1]].b }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x i8>, <8 x i8>* %a
%shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -820,12 +934,13 @@ define void @concat_v32i8_4op(<8 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @concat_v16i16_4op(<4 x i16>* %a, <16 x i16>* %b) #0 {
+define void @concat_v16i16_4op(<4 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v16i16_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { z[[OP1]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x i16>, <4 x i16>* %a
%shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -834,12 +949,13 @@ define void @concat_v16i16_4op(<4 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @concat_v8i32_4op(<2 x i32>* %a, <8 x i32>* %b) #0 {
+define void @concat_v8i32_4op(<2 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v8i32_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { z[[OP1]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <2 x i32>, <2 x i32>* %a
%shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -847,12 +963,13 @@ define void @concat_v8i32_4op(<2 x i32>* %a, <8 x i32>* %b) #0 {
ret void
}
-define void @concat_v4i64_4op(<1 x i64>* %a, <4 x i64>* %b) #0 {
+define void @concat_v4i64_4op(<1 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v4i64_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { z[[OP1]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <1 x i64>, <1 x i64>* %a
%shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
%res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index 9c1ee54d2665..fde767ac4014 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -1,25 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
-define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
+define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_zext_v4i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -49,7 +34,7 @@ define <2 x i256> @load_zext_v2i64i256(<2 x i64>* %ap) #0 {
ret <2 x i256> %val
}
-define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
+define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_zext_v8i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -61,103 +46,43 @@ define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
ret <8 x i32> %val
}
-define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v16i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: load_zext_v16i16i32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_512-NEXT: ret
-
- ; Ensure sensible type legalistaion
+define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
+; CHECK-LABEL: load_zext_v16i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %ap
%val = zext <16 x i16> %a to <16 x i32>
ret <16 x i32> %val
}
-define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: load_zext_v32i16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
+; CHECK-LABEL: load_zext_v32i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = zext <32 x i16> %a to <32 x i32>
ret <32 x i32> %val
}
define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v64i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x11, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: mov x10, #56
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: uunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: mov x9, #32
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
@@ -170,7 +95,7 @@ define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
ret <64 x i32> %val
}
-define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
+define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_sext_v4i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -181,7 +106,7 @@ define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
ret <4 x i32> %val
}
-define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
+define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_sext_v8i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -193,103 +118,43 @@ define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
ret <8 x i32> %val
}
-define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v16i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: load_sext_v16i16i32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_512-NEXT: ret
-
- ; Ensure sensible type legalistaion
+define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
+; CHECK-LABEL: load_sext_v16i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %ap
%val = sext <16 x i16> %a to <16 x i32>
ret <16 x i32> %val
}
-define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: load_sext_v32i16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
+; CHECK-LABEL: load_sext_v32i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = sext <32 x i16> %a to <32 x i32>
ret <32 x i32> %val
}
define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v64i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x11, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: mov x10, #56
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: mov x9, #32
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
@@ -303,52 +168,22 @@ define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
}
define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ushll2 v2.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: ushll v1.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: ushll2 v4.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: uunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: uunpklo z2.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z3.s, z4.h
-; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: uunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: uunpklo z0.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: uunpklo z1.h, z0.b
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
@@ -362,52 +197,22 @@ define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
}
define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: sshll2 v2.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: sshll v1.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sshll2 v4.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: sunpklo z2.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z4.h
-; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z0.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: sunpklo z1.h, z0.b
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
@@ -421,50 +226,20 @@ define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
}
define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z0.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: uunpklo z2.s, z6.h
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z4.s
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
@@ -478,50 +253,20 @@ define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
}
define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z0.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: sunpklo z2.s, z6.h
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z4.s
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
+; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
@@ -535,42 +280,18 @@ define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
}
define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: uunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
@@ -584,42 +305,18 @@ define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
}
define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: sunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: sunpklo z1.d, z0.s
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index f454be420905..402e270b5313 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -1,28 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; i8
; Don't use SVE for 64-bit vectors.
-define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
+define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
@@ -32,7 +18,7 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
+define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -42,7 +28,7 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
ret <8 x i8> %ret
}
-define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) #0 {
+define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@@ -79,62 +65,30 @@ define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #64
-; VBITS_GE_256-NEXT: mov w9, #96
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: mov w8, #32
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%ret = call <64 x i8> @llvm.experimental.vector.extract.v64i8.v128i8(<128 x i8> %op, i64 64)
store <64 x i8> %ret, <64 x i8>* %b
ret void
}
-define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v256i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #128
-; VBITS_GE_256-NEXT: mov w9, #160
-; VBITS_GE_256-NEXT: mov w10, #224
-; VBITS_GE_256-NEXT: mov w11, #192
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11]
-; VBITS_GE_256-NEXT: mov w8, #64
-; VBITS_GE_256-NEXT: mov w9, #96
-; VBITS_GE_256-NEXT: mov w10, #32
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%ret = call <128 x i8> @llvm.experimental.vector.extract.v128i8.v256i8(<256 x i8> %op, i64 128)
store <128 x i8> %ret, <128 x i8>* %b
@@ -144,7 +98,7 @@ define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
; i16
; Don't use SVE for 64-bit vectors.
-define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
+define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -159,7 +113,7 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
+define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -169,7 +123,7 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
ret <4 x i16> %ret
}
-define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) #0 {
+define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -206,62 +160,30 @@ define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%ret = call <32 x i16> @llvm.experimental.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32)
store <32 x i16> %ret, <32 x i16>* %b
ret void
}
-define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #64
-; VBITS_GE_256-NEXT: mov x9, #80
-; VBITS_GE_256-NEXT: mov x10, #112
-; VBITS_GE_256-NEXT: mov x11, #96
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%ret = call <64 x i16> @llvm.experimental.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64)
store <64 x i16> %ret, <64 x i16>* %b
@@ -271,7 +193,7 @@ define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
; i32
; Don't use SVE for 64-bit vectors.
-define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
+define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -282,7 +204,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
+define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -292,7 +214,7 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
ret <2 x i32> %ret
}
-define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) #0 {
+define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -329,62 +251,30 @@ define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 {
ret void
}
-define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%ret = call <16 x i32> @llvm.experimental.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16)
store <16 x i32> %ret, <16 x i32>* %b
ret void
}
-define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #40
-; VBITS_GE_256-NEXT: mov x10, #56
-; VBITS_GE_256-NEXT: mov x11, #48
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%ret = call <32 x i32> @llvm.experimental.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32)
store <32 x i32> %ret, <32 x i32>* %b
@@ -394,7 +284,7 @@ define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
; i64
; Don't use SVE for 128-bit vectors.
-define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
+define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -404,7 +294,7 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
ret <1 x i64> %ret
}
-define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
+define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -418,23 +308,14 @@ define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
ret void
}
-define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: extract_subvector_v8i64:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ptrue p0.d, vl4
-; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_512-NEXT: ret
+define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: extract_subvector_v8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #4
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%ret = call <4 x i64> @llvm.experimental.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4)
store <4 x i64> %ret, <4 x i64>* %b
@@ -453,50 +334,20 @@ define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 {
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%ret = call <8 x i64> @llvm.experimental.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
store <8 x i64> %ret, <8 x i64>* %b
ret void
}
-define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl16
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #16
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%ret = call <16 x i64> @llvm.experimental.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16)
store <16 x i64> %ret, <16 x i64>* %b
@@ -506,7 +357,7 @@ define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
; f16
; Don't use SVE for 64-bit vectors.
-define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
+define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -517,7 +368,7 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
+define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -527,7 +378,7 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
ret <4 x half> %ret
}
-define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) #0 {
+define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -564,62 +415,30 @@ define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 {
ret void
}
-define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%ret = call <32 x half> @llvm.experimental.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32)
store <32 x half> %ret, <32 x half>* %b
ret void
}
-define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #64
-; VBITS_GE_256-NEXT: mov x9, #80
-; VBITS_GE_256-NEXT: mov x10, #112
-; VBITS_GE_256-NEXT: mov x11, #96
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%ret = call <64 x half> @llvm.experimental.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64)
store <64 x half> %ret, <64 x half>* %b
@@ -629,7 +448,7 @@ define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
; f32
; Don't use SVE for 64-bit vectors.
-define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
+define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -640,7 +459,7 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
+define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -650,7 +469,7 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
ret <2 x float> %ret
}
-define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) #0 {
+define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -687,62 +506,30 @@ define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 {
ret void
}
-define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%ret = call <16 x float> @llvm.experimental.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16)
store <16 x float> %ret, <16 x float>* %b
ret void
}
-define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #40
-; VBITS_GE_256-NEXT: mov x10, #56
-; VBITS_GE_256-NEXT: mov x11, #48
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%ret = call <32 x float> @llvm.experimental.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32)
store <32 x float> %ret, <32 x float>* %b
@@ -752,7 +539,7 @@ define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
; f64
; Don't use SVE for 128-bit vectors.
-define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
+define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -762,7 +549,7 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
ret <1 x double> %ret
}
-define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) #0 {
+define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -799,62 +586,30 @@ define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 {
ret void
}
-define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%ret = call <8 x double> @llvm.experimental.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8)
store <8 x double> %ret, <8 x double>* %b
ret void
}
-define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl16
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%ret = call <16 x double> @llvm.experimental.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16)
store <16 x double> %ret, <16 x double>* %b
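In the restructured tests above, the vscale_range values appear to track the deleted VBITS_GE_* RUN lines: on AArch64 SVE a scalable register is vscale x 128 bits, so vscale_range(2,0) guarantees at least 256-bit registers, vscale_range(8,0) at least 1024 bits, and vscale_range(16,0) at least 2048 bits, with a maximum of 0 meaning unbounded. A minimal sketch of the pattern, assuming an SVE-enabled build (the function name is illustrative and not part of the patch):

; With vscale_range(8,0) the backend may assume a register of at least
; 1024 bits, so this 512-bit fixed-length vector can be handled in a
; single z register (ptrue ... vl32 style code as in the checks above).
define void @copy_v32f16(<32 x half>* %src, <32 x half>* %dst) vscale_range(8,0) #0 {
  %v = load <32 x half>, <32 x half>* %src
  store <32 x half> %v, <32 x half>* %dst
  ret void
}

attributes #0 = { "target-features"="+sve" }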
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index 800af021c8ce..d8de704a241e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -1,221 +1,259 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; extractelement
;
; Don't use SVE for 64-bit vectors.
-define half @extractelement_v4f16(<4 x half> %op1) #0 {
+define half @extractelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f16:
-; CHECK: mov h0, v0.h[3]
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: ret
%r = extractelement <4 x half> %op1, i64 3
ret half %r
}
; Don't use SVE for 128-bit vectors.
-define half @extractelement_v8f16(<8 x half> %op1) #0 {
+define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v8f16:
-; CHECK: mov h0, v0.h[7]
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: ret
%r = extractelement <8 x half> %op1, i64 7
ret half %r
}
-define half @extractelement_v16f16(<16 x half>* %a) #0 {
+define half @extractelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v16f16:
-; VBITS_GE_256: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: mov z0.h, z0.h[15]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%r = extractelement <16 x half> %op1, i64 15
ret half %r
}
define half @extractelement_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: extractelement_v32f16:
-; VBITS_GE_512: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-LABEL: extractelement_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
+; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: extractelement_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.h, z0.h[31]
+; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%r = extractelement <32 x half> %op1, i64 31
ret half %r
}
-define half @extractelement_v64f16(<64 x half>* %a) #0 {
+define half @extractelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v64f16:
-; VBITS_GE_1024: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov w8, #63
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: whilels p0.h, xzr, x8
-; VBITS_GE_1024-NEXT: lastb h0, p0, z0.h
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: whilels p0.h, xzr, x8
+; CHECK-NEXT: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%r = extractelement <64 x half> %op1, i64 63
ret half %r
}
-define half @extractelement_v128f16(<128 x half>* %a) #0 {
+define half @extractelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v128f16:
-; VBITS_GE_2048: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: mov w8, #127
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: whilels p0.h, xzr, x8
-; VBITS_GE_2048-NEXT: lastb h0, p0, z0.h
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: mov w8, #127
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: whilels p0.h, xzr, x8
+; CHECK-NEXT: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%r = extractelement <128 x half> %op1, i64 127
ret half %r
}
; Don't use SVE for 64-bit vectors.
-define float @extractelement_v2f32(<2 x float> %op1) #0 {
+define float @extractelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v2f32:
-; CHECK: mov s0, v0.s[1]
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: ret
%r = extractelement <2 x float> %op1, i64 1
ret float %r
}
; Don't use SVE for 128-bit vectors.
-define float @extractelement_v4f32(<4 x float> %op1) #0 {
+define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f32:
-; CHECK: mov s0, v0.s[3]
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s0, v0.s[3]
; CHECK-NEXT: ret
%r = extractelement <4 x float> %op1, i64 3
ret float %r
}
-define float @extractelement_v8f32(<8 x float>* %a) #0 {
+define float @extractelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v8f32:
-; VBITS_GE_256: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov z0.s, z0.s[7]
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%r = extractelement <8 x float> %op1, i64 7
ret float %r
}
define float @extractelement_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: extractelement_v16f32:
-; VBITS_GE_512: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-LABEL: extractelement_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
+; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: extractelement_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.s, z0.s[15]
+; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%r = extractelement <16 x float> %op1, i64 15
ret float %r
}
-define float @extractelement_v32f32(<32 x float>* %a) #0 {
+define float @extractelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v32f32:
-; VBITS_GE_1024: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov w8, #31
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: whilels p0.s, xzr, x8
-; VBITS_GE_1024-NEXT: lastb s0, p0, z0.s
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: whilels p0.s, xzr, x8
+; CHECK-NEXT: lastb s0, p0, z0.s
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%r = extractelement <32 x float> %op1, i64 31
ret float %r
}
-define float @extractelement_v64f32(<64 x float>* %a) #0 {
+define float @extractelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v64f32:
-; VBITS_GE_2048: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov w8, #63
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: whilels p0.s, xzr, x8
-; VBITS_GE_2048-NEXT: lastb s0, p0, z0.s
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: whilels p0.s, xzr, x8
+; CHECK-NEXT: lastb s0, p0, z0.s
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%r = extractelement <64 x float> %op1, i64 63
ret float %r
}
; Don't use SVE for 64-bit vectors.
-define double @extractelement_v1f64(<1 x double> %op1) #0 {
+define double @extractelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v1f64:
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
%r = extractelement <1 x double> %op1, i64 0
ret double %r
}
; Don't use SVE for 128-bit vectors.
-define double @extractelement_v2f64(<2 x double> %op1) #0 {
+define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v2f64:
-; CHECK: mov d0, v0.d[1]
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d0, v0.d[1]
; CHECK-NEXT: ret
%r = extractelement <2 x double> %op1, i64 1
ret double %r
}
-define double @extractelement_v4f64(<4 x double>* %a) #0 {
+define double @extractelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f64:
-; VBITS_GE_256: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%r = extractelement <4 x double> %op1, i64 3
ret double %r
}
define double @extractelement_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: extractelement_v8f64:
-; VBITS_GE_512: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-LABEL: extractelement_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: extractelement_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.d, z0.d[7]
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%r = extractelement <8 x double> %op1, i64 7
ret double %r
}
-define double @extractelement_v16f64(<16 x double>* %a) #0 {
+define double @extractelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v16f64:
-; VBITS_GE_1024: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov w8, #15
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: whilels p0.d, xzr, x8
-; VBITS_GE_1024-NEXT: lastb d0, p0, z0.d
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: mov w8, #15
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: whilels p0.d, xzr, x8
+; CHECK-NEXT: lastb d0, p0, z0.d
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%r = extractelement <16 x double> %op1, i64 15
ret double %r
}
-define double @extractelement_v32f64(<32 x double>* %a) #0 {
+define double @extractelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v32f64:
-; VBITS_GE_2048: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: mov w8, #31
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: whilels p0.d, xzr, x8
-; VBITS_GE_2048-NEXT: lastb d0, p0, z0.d
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: whilels p0.d, xzr, x8
+; CHECK-NEXT: lastb d0, p0, z0.d
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%r = extractelement <32 x double> %op1, i64 31
ret double %r
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
index a390ccccd063..1d588c90b8ef 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
@@ -1,60 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
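The deleted -D#VBYTES runs relied on FileCheck numeric expressions to scale the expected vector length with the RUN line. As a worked example, taking VBYTES=64 from the removed min=512 run, one of the old check lines evaluates as:

; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
;   with VBYTES=64: min(div(64,2),32) = 32, i.e. it matches "ptrue p0.h, vl32"

update_llc_test_checks.py emits literal instruction sequences rather than such expressions, which is presumably why the expression-based checks are replaced by per-prefix generated bodies below.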
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; FADD
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f16:
-; CHECK: fadd v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = fadd <4 x half> %op1, %op2
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v8f16:
-; CHECK: fadd v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = fadd <8 x half> %op1, %op2
ret <8 x half> %res
}
-define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = fadd <16 x half> %op1, %op2
@@ -63,18 +46,28 @@ define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fadd_v32f16:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov x[[IDX_1:[0-9]+]], #[[#div(VBYTES,2)]]
-; VBITS_LE_256-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_1]], lsl #1]
-; VBITS_LE_256-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_1]], lsl #1]
-; VBITS_LE_256-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
-; VBITS_LE_256-DAG: st1h { [[RES_1]].h }, [[PG]], [x0, x[[IDX_1]], lsl #1]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fadd_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fadd z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: fadd z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fadd_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fadd z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = fadd <32 x half> %op1, %op2
@@ -82,29 +75,15 @@ define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v64f16:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov x[[IDX_1:[0-9]+]], #[[#div(VBYTES,2)]]
-; VBITS_LE_512-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_1]], lsl #1]
-; VBITS_LE_512-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_1]], lsl #1]
-; VBITS_LE_512-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
-; VBITS_LE_512-DAG: st1h { [[RES_1]].h }, [[PG]], [x0, x[[IDX_1]], lsl #1]
-; VBITS_LE_256-DAG: mov x[[IDX_2:[0-9]+]], #[[#mul(div(VBYTES,2),2)]]
-; VBITS_LE_256-DAG: ld1h { [[OP1_2:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_2]], lsl #1]
-; VBITS_LE_256-DAG: ld1h { [[OP2_2:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_2]], lsl #1]
-; VBITS_LE_256-DAG: fadd [[RES_2:z[0-9]+]].h, [[PG]]/m, [[OP1_2]].h, [[OP2_2]].h
-; VBITS_LE_256-DAG: st1h { [[RES_2]].h }, [[PG]], [x0, x[[IDX_2]], lsl #1]
-; VBITS_LE_256-DAG: mov x[[IDX_3:[0-9]+]], #[[#mul(div(VBYTES,2),3)]]
-; VBITS_LE_256-DAG: ld1h { [[OP1_3:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_3]], lsl #1]
-; VBITS_LE_256-DAG: ld1h { [[OP2_3:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_3]], lsl #1]
-; VBITS_LE_256-DAG: fadd [[RES_3:z[0-9]+]].h, [[PG]]/m, [[OP1_3]].h, [[OP2_3]].h
-; VBITS_LE_256-DAG: st1h { [[RES_3]].h }, [[PG]], [x0, x[[IDX_3]], lsl #1]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = fadd <64 x half> %op1, %op2
@@ -112,16 +91,15 @@ define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = fadd <128 x half> %op1, %op2
@@ -130,31 +108,34 @@ define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v2f32:
-; CHECK: fadd v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = fadd <2 x float> %op1, %op2
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f32:
-; CHECK: fadd v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = fadd <4 x float> %op1, %op2
ret <4 x float> %res
}
-define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = fadd <8 x float> %op1, %op2
@@ -162,16 +143,29 @@ define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fadd_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fadd_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: fadd z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fadd_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = fadd <16 x float> %op1, %op2
@@ -179,16 +173,15 @@ define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = fadd <32 x float> %op1, %op2
@@ -196,16 +189,15 @@ define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = fadd <64 x float> %op1, %op2
@@ -214,31 +206,34 @@ define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v1f64:
-; CHECK: fadd d0, d0, d1
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
%res = fadd <1 x double> %op1, %op2
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v2f64:
-; CHECK: fadd v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = fadd <2 x double> %op1, %op2
ret <2 x double> %res
}
-define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = fadd <4 x double> %op1, %op2
@@ -246,16 +241,29 @@ define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fadd_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fadd_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fadd z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: fadd z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fadd_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fadd z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = fadd <8 x double> %op1, %op2
@@ -263,16 +271,15 @@ define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = fadd <16 x double> %op1, %op2
@@ -280,16 +287,15 @@ define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = fadd <32 x double> %op1, %op2
@@ -297,41 +303,39 @@ define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
ret void
}
-;
-; NOTE: Tests beyond this point only have CHECK lines to validate the first
-; VBYTES because the fadd tests already validate the legalisation code paths.
-;
-
;
; FDIV
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f16:
-; CHECK: fdiv v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = fdiv <4 x half> %op1, %op2
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v8f16:
-; CHECK: fdiv v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = fdiv <8 x half> %op1, %op2
ret <8 x half> %res
}
-define void @fdiv_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fdiv_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = fdiv <16 x half> %op1, %op2
@@ -340,13 +344,28 @@ define void @fdiv_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fdiv_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fdiv_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fdiv_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fdiv z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: fdiv z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fdiv_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = fdiv <32 x half> %op1, %op2
@@ -354,14 +373,15 @@ define void @fdiv_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fdiv_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fdiv_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = fdiv <64 x half> %op1, %op2
@@ -369,14 +389,15 @@ define void @fdiv_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fdiv_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fdiv_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = fdiv <128 x half> %op1, %op2
@@ -385,31 +406,34 @@ define void @fdiv_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v2f32:
-; CHECK: fdiv v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = fdiv <2 x float> %op1, %op2
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f32:
-; CHECK: fdiv v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = fdiv <4 x float> %op1, %op2
ret <4 x float> %res
}
-define void @fdiv_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fdiv_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = fdiv <8 x float> %op1, %op2
@@ -418,13 +442,28 @@ define void @fdiv_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fdiv_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fdiv_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fdiv_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fdiv z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: fdiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fdiv_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = fdiv <16 x float> %op1, %op2
@@ -432,14 +471,15 @@ define void @fdiv_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fdiv_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fdiv_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = fdiv <32 x float> %op1, %op2
@@ -447,14 +487,15 @@ define void @fdiv_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fdiv_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fdiv_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = fdiv <64 x float> %op1, %op2
@@ -463,31 +504,34 @@ define void @fdiv_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v1f64:
-; CHECK: fdiv d0, d0, d1
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv d0, d0, d1
+; CHECK-NEXT: ret
%res = fdiv <1 x double> %op1, %op2
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v2f64:
-; CHECK: fdiv v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = fdiv <2 x double> %op1, %op2
ret <2 x double> %res
}
-define void @fdiv_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fdiv_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = fdiv <4 x double> %op1, %op2
@@ -496,13 +540,28 @@ define void @fdiv_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fdiv_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fdiv_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fdiv_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fdiv z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: fdiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fdiv_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = fdiv <8 x double> %op1, %op2
@@ -510,14 +569,15 @@ define void @fdiv_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fdiv_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fdiv_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = fdiv <16 x double> %op1, %op2
@@ -525,14 +585,15 @@ define void @fdiv_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fdiv_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fdiv_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = fdiv <32 x double> %op1, %op2
@@ -545,32 +606,37 @@ define void @fdiv_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) #0 {
+define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
-; CHECK: fmla v2.4h, v1.4h, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla v2.4h, v1.4h, v0.4h
+; CHECK-NEXT: fmov d0, d2
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) #0 {
+define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
-; CHECK: fmla v2.8h, v1.8h, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla v2.8h, v1.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
ret <8 x half> %res
}
-define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
+define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%op3 = load <16 x half>, <16 x half>* %c
@@ -580,14 +646,31 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
}
define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
-; CHECK-LABEL: fma_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fma_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2]
+; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z2.h, z4.h
+; VBITS_GE_256-NEXT: fmad z1.h, p0/m, z3.h, z5.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fma_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x2]
+; VBITS_GE_512-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%op3 = load <32 x half>, <32 x half>* %c
@@ -596,15 +679,16 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
ret void
}
-define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
+define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%op3 = load <64 x half>, <64 x half>* %c
@@ -613,15 +697,16 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
ret void
}
-define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #0 {
+define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%op3 = load <128 x half>, <128 x half>* %c
@@ -631,32 +716,37 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) #0 {
+define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
-; CHECK: fmla v2.2s, v1.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla v2.2s, v1.2s, v0.2s
+; CHECK-NEXT: fmov d0, d2
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) #0 {
+define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
-; CHECK: fmla v2.4s, v1.4s, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
ret <4 x float> %res
}
-define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
+define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%op3 = load <8 x float>, <8 x float>* %c
@@ -666,14 +756,31 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
}
define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 {
-; CHECK-LABEL: fma_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fma_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2]
+; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z2.s, z4.s
+; VBITS_GE_256-NEXT: fmad z1.s, p0/m, z3.s, z5.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fma_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x2]
+; VBITS_GE_512-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%op3 = load <16 x float>, <16 x float>* %c
@@ -682,15 +789,16 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
ret void
}
-define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 {
+define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%op3 = load <32 x float>, <32 x float>* %c
@@ -699,15 +807,16 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
ret void
}
-define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 {
+define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%op3 = load <64 x float>, <64 x float>* %c
@@ -717,32 +826,36 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) #0 {
+define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
-; CHECK: fmadd d0, d0, d1, d2
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d0, d1, d2
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.fma.v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) #0 {
+define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
-; CHECK: fmla v2.2d, v1.2d, v0.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
ret <2 x double> %res
}
-define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 {
+define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%op3 = load <4 x double>, <4 x double>* %c
@@ -752,14 +865,31 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
}
define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 {
-; CHECK-LABEL: fma_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fma_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2]
+; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z2.d, z4.d
+; VBITS_GE_256-NEXT: fmad z1.d, p0/m, z3.d, z5.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fma_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x2]
+; VBITS_GE_512-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%op3 = load <8 x double>, <8 x double>* %c
@@ -768,15 +898,16 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
ret void
}
-define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) #0 {
+define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%op3 = load <16 x double>, <16 x double>* %c
@@ -785,15 +916,16 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
ret void
}
-define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) #0 {
+define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%op3 = load <32 x double>, <32 x double>* %c
@@ -807,31 +939,34 @@ define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c)
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f16:
-; CHECK: fmul v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = fmul <4 x half> %op1, %op2
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v8f16:
-; CHECK: fmul v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = fmul <8 x half> %op1, %op2
ret <8 x half> %res
}
-define void @fmul_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmul_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = fmul <16 x half> %op1, %op2
@@ -840,13 +975,28 @@ define void @fmul_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fmul_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmul_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fmul_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmul z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmul_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = fmul <32 x half> %op1, %op2
@@ -854,14 +1004,15 @@ define void @fmul_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fmul_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmul_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = fmul <64 x half> %op1, %op2
@@ -869,14 +1020,15 @@ define void @fmul_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fmul_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmul_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmul_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = fmul <128 x half> %op1, %op2
@@ -885,31 +1037,34 @@ define void @fmul_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v2f32:
-; CHECK: fmul v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = fmul <2 x float> %op1, %op2
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f32:
-; CHECK: fmul v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = fmul <4 x float> %op1, %op2
ret <4 x float> %res
}
-define void @fmul_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmul_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = fmul <8 x float> %op1, %op2
@@ -918,13 +1073,28 @@ define void @fmul_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fmul_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmul_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fmul_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmul z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmul_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = fmul <16 x float> %op1, %op2
@@ -932,14 +1102,15 @@ define void @fmul_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fmul_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmul_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = fmul <32 x float> %op1, %op2
@@ -947,14 +1118,15 @@ define void @fmul_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fmul_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmul_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmul_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = fmul <64 x float> %op1, %op2
@@ -963,31 +1135,34 @@ define void @fmul_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v1f64:
-; CHECK: fmul d0, d0, d1
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: ret
%res = fmul <1 x double> %op1, %op2
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v2f64:
-; CHECK: fmul v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = fmul <2 x double> %op1, %op2
ret <2 x double> %res
}
-define void @fmul_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmul_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = fmul <4 x double> %op1, %op2
@@ -996,13 +1171,28 @@ define void @fmul_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fmul_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmul_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fmul_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmul z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: fmul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmul_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = fmul <8 x double> %op1, %op2
@@ -1010,14 +1200,15 @@ define void @fmul_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fmul_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmul_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = fmul <16 x double> %op1, %op2
@@ -1025,14 +1216,15 @@ define void @fmul_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fmul_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmul_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmul_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = fmul <32 x double> %op1, %op2
@@ -1045,30 +1237,33 @@ define void @fmul_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fneg_v4f16(<4 x half> %op) #0 {
+define <4 x half> @fneg_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v4f16:
-; CHECK: fneg v0.4h, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fneg v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = fneg <4 x half> %op
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fneg_v8f16(<8 x half> %op) #0 {
+define <8 x half> @fneg_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v8f16:
-; CHECK: fneg v0.8h, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fneg v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = fneg <8 x half> %op
ret <8 x half> %res
}
-define void @fneg_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fneg_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = fneg <16 x half> %op
store <16 x half> %res, <16 x half>* %a
@@ -1076,38 +1271,53 @@ define void @fneg_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fneg_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fneg_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fneg_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fneg z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: fneg z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fneg_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fneg z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = fneg <32 x half> %op
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @fneg_v64f16(<64 x half>* %a) #0 {
+define void @fneg_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fneg_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = fneg <64 x half> %op
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @fneg_v128f16(<128 x half>* %a) #0 {
+define void @fneg_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fneg_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = fneg <128 x half> %op
store <128 x half> %res, <128 x half>* %a
@@ -1115,30 +1325,33 @@ define void @fneg_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fneg_v2f32(<2 x float> %op) #0 {
+define <2 x float> @fneg_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v2f32:
-; CHECK: fneg v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fneg v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = fneg <2 x float> %op
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fneg_v4f32(<4 x float> %op) #0 {
+define <4 x float> @fneg_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v4f32:
-; CHECK: fneg v0.4s, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fneg v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = fneg <4 x float> %op
ret <4 x float> %res
}
-define void @fneg_v8f32(<8 x float>* %a) #0 {
+define void @fneg_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = fneg <8 x float> %op
store <8 x float> %res, <8 x float>* %a
@@ -1146,38 +1359,53 @@ define void @fneg_v8f32(<8 x float>* %a) #0 {
}
define void @fneg_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fneg_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fneg_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fneg z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fneg z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fneg_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fneg z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = fneg <16 x float> %op
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @fneg_v32f32(<32 x float>* %a) #0 {
+define void @fneg_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fneg_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = fneg <32 x float> %op
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @fneg_v64f32(<64 x float>* %a) #0 {
+define void @fneg_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fneg_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = fneg <64 x float> %op
store <64 x float> %res, <64 x float>* %a
@@ -1185,30 +1413,33 @@ define void @fneg_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fneg_v1f64(<1 x double> %op) #0 {
+define <1 x double> @fneg_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v1f64:
-; CHECK: fneg d0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fneg d0, d0
+; CHECK-NEXT: ret
%res = fneg <1 x double> %op
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fneg_v2f64(<2 x double> %op) #0 {
+define <2 x double> @fneg_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v2f64:
-; CHECK: fneg v0.2d, v0.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fneg v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = fneg <2 x double> %op
ret <2 x double> %res
}
-define void @fneg_v4f64(<4 x double>* %a) #0 {
+define void @fneg_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = fneg <4 x double> %op
store <4 x double> %res, <4 x double>* %a
@@ -1216,38 +1447,53 @@ define void @fneg_v4f64(<4 x double>* %a) #0 {
}
define void @fneg_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fneg_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fneg_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fneg z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fneg z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fneg_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fneg z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = fneg <8 x double> %op
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @fneg_v16f64(<16 x double>* %a) #0 {
+define void @fneg_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fneg_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = fneg <16 x double> %op
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @fneg_v32f64(<32 x double>* %a) #0 {
+define void @fneg_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fneg_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fneg z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = fneg <32 x double> %op
store <32 x double> %res, <32 x double>* %a
@@ -1259,30 +1505,33 @@ define void @fneg_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fsqrt_v4f16(<4 x half> %op) #0 {
+define <4 x half> @fsqrt_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v4f16:
-; CHECK: fsqrt v0.4h, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fsqrt_v8f16(<8 x half> %op) #0 {
+define <8 x half> @fsqrt_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v8f16:
-; CHECK: fsqrt v0.8h, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @fsqrt_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fsqrt_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -1290,38 +1539,53 @@ define void @fsqrt_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fsqrt_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fsqrt_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsqrt_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fsqrt z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: fsqrt z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fsqrt_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fsqrt z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @fsqrt_v64f16(<64 x half>* %a) #0 {
+define void @fsqrt_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fsqrt_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.sqrt.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @fsqrt_v128f16(<128 x half>* %a) #0 {
+define void @fsqrt_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fsqrt_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.sqrt.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -1329,30 +1593,33 @@ define void @fsqrt_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fsqrt_v2f32(<2 x float> %op) #0 {
+define <2 x float> @fsqrt_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v2f32:
-; CHECK: fsqrt v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fsqrt_v4f32(<4 x float> %op) #0 {
+define <4 x float> @fsqrt_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v4f32:
-; CHECK: fsqrt v0.4s, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @fsqrt_v8f32(<8 x float>* %a) #0 {
+define void @fsqrt_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -1360,38 +1627,53 @@ define void @fsqrt_v8f32(<8 x float>* %a) #0 {
}
define void @fsqrt_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fsqrt_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsqrt_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fsqrt z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fsqrt z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fsqrt_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fsqrt z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @fsqrt_v32f32(<32 x float>* %a) #0 {
+define void @fsqrt_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fsqrt_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.sqrt.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @fsqrt_v64f32(<64 x float>* %a) #0 {
+define void @fsqrt_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fsqrt_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.sqrt.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -1399,30 +1681,33 @@ define void @fsqrt_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fsqrt_v1f64(<1 x double> %op) #0 {
+define <1 x double> @fsqrt_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v1f64:
-; CHECK: fsqrt d0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fsqrt_v2f64(<2 x double> %op) #0 {
+define <2 x double> @fsqrt_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v2f64:
-; CHECK: fsqrt v0.2d, v0.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @fsqrt_v4f64(<4 x double>* %a) #0 {
+define void @fsqrt_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fsqrt_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -1430,38 +1715,53 @@ define void @fsqrt_v4f64(<4 x double>* %a) #0 {
}
define void @fsqrt_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fsqrt_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsqrt_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fsqrt z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fsqrt z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fsqrt_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fsqrt z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @fsqrt_v16f64(<16 x double>* %a) #0 {
+define void @fsqrt_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fsqrt_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.sqrt.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @fsqrt_v32f64(<32 x double>* %a) #0 {
+define void @fsqrt_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fsqrt_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.sqrt.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -1473,31 +1773,34 @@ define void @fsqrt_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f16:
-; CHECK: fsub v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = fsub <4 x half> %op1, %op2
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v8f16:
-; CHECK: fsub v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = fsub <8 x half> %op1, %op2
ret <8 x half> %res
}
-define void @fsub_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fsub_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = fsub <16 x half> %op1, %op2
@@ -1506,13 +1809,28 @@ define void @fsub_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fsub_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fsub_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsub_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fsub z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: fsub z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fsub_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fsub z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = fsub <32 x half> %op1, %op2
@@ -1520,14 +1838,15 @@ define void @fsub_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fsub_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fsub_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = fsub <64 x half> %op1, %op2
@@ -1535,14 +1854,15 @@ define void @fsub_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fsub_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fsub_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = fsub <128 x half> %op1, %op2
@@ -1551,31 +1871,34 @@ define void @fsub_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v2f32:
-; CHECK: fsub v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = fsub <2 x float> %op1, %op2
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f32:
-; CHECK: fsub v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = fsub <4 x float> %op1, %op2
ret <4 x float> %res
}
-define void @fsub_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fsub_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = fsub <8 x float> %op1, %op2
@@ -1584,13 +1907,28 @@ define void @fsub_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fsub_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fsub_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsub_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fsub z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: fsub z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fsub_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = fsub <16 x float> %op1, %op2
@@ -1598,14 +1936,15 @@ define void @fsub_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fsub_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fsub_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = fsub <32 x float> %op1, %op2
@@ -1613,14 +1952,15 @@ define void @fsub_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fsub_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fsub_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = fsub <64 x float> %op1, %op2
@@ -1629,31 +1969,34 @@ define void @fsub_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v1f64:
-; CHECK: fsub d0, d0, d1
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub d0, d0, d1
+; CHECK-NEXT: ret
%res = fsub <1 x double> %op1, %op2
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v2f64:
-; CHECK: fsub v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = fsub <2 x double> %op1, %op2
ret <2 x double> %res
}
-define void @fsub_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fsub_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = fsub <4 x double> %op1, %op2
@@ -1662,13 +2005,28 @@ define void @fsub_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fsub_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fsub_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsub_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fsub z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: fsub z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fsub_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fsub z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = fsub <8 x double> %op1, %op2
@@ -1676,14 +2034,15 @@ define void @fsub_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fsub_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fsub_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = fsub <16 x double> %op1, %op2
@@ -1691,14 +2050,15 @@ define void @fsub_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = fsub <32 x double> %op1, %op2
@@ -1711,30 +2071,33 @@ define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fabs_v4f16(<4 x half> %op) #0 {
+define <4 x half> @fabs_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f16:
-; CHECK: fabs v0.4h, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fabs_v8f16(<8 x half> %op) #0 {
+define <8 x half> @fabs_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v8f16:
-; CHECK: fabs v0.8h, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @fabs_v16f16(<16 x half>* %a) #0 {
+define void @fabs_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -1742,38 +2105,53 @@ define void @fabs_v16f16(<16 x half>* %a) #0 {
}
define void @fabs_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fabs_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fabs_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fabs z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: fabs z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fabs_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fabs z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @fabs_v64f16(<64 x half>* %a) #0 {
+define void @fabs_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @fabs_v128f16(<128 x half>* %a) #0 {
+define void @fabs_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.fabs.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -1781,30 +2159,33 @@ define void @fabs_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fabs_v2f32(<2 x float> %op) #0 {
+define <2 x float> @fabs_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v2f32:
-; CHECK: fabs v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fabs_v4f32(<4 x float> %op) #0 {
+define <4 x float> @fabs_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f32:
-; CHECK: fabs v0.4s, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @fabs_v8f32(<8 x float>* %a) #0 {
+define void @fabs_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -1812,38 +2193,53 @@ define void @fabs_v8f32(<8 x float>* %a) #0 {
}
define void @fabs_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fabs_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fabs_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fabs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fabs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fabs_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fabs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @fabs_v32f32(<32 x float>* %a) #0 {
+define void @fabs_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @fabs_v64f32(<64 x float>* %a) #0 {
+define void @fabs_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -1851,30 +2247,33 @@ define void @fabs_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fabs_v1f64(<1 x double> %op) #0 {
+define <1 x double> @fabs_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v1f64:
-; CHECK: fabs d0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fabs_v2f64(<2 x double> %op) #0 {
+define <2 x double> @fabs_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v2f64:
-; CHECK: fabs v0.2d, v0.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @fabs_v4f64(<4 x double>* %a) #0 {
+define void @fabs_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -1882,38 +2281,53 @@ define void @fabs_v4f64(<4 x double>* %a) #0 {
}
define void @fabs_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fabs_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fabs_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fabs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fabs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fabs_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fabs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @fabs_v16f64(<16 x double>* %a) #0 {
+define void @fabs_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @fabs_v32f64(<32 x double>* %a) #0 {
+define void @fabs_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fabs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
similarity index 57%
rename from llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
rename to llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
index 2180a405b53c..db7bef039f66 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep 'z[0-9]'
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h
@@ -35,7 +21,7 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.8h, v0.8h, v1.8h
@@ -45,7 +31,7 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
ret <8 x i16> %sext
}
-define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -64,7 +50,6 @@ define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
}
define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 {
-; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
@@ -98,44 +83,16 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #
ret void
}
-define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h
-; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h
-; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_1024-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x2]
-; VBITS_GE_1024-NEXT: ret
+define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%cmp = fcmp oeq <64 x half> %op1, %op2
@@ -144,68 +101,16 @@ define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #
ret void
}
-define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #96
-; VBITS_GE_256-NEXT: mov x9, #112
-; VBITS_GE_256-NEXT: mov x10, #64
-; VBITS_GE_256-NEXT: mov x11, #80
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #48
-; VBITS_GE_256-NEXT: mov x14, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h
-; VBITS_GE_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h
-; VBITS_GE_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h
-; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h
-; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_2048-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x2]
-; VBITS_GE_2048-NEXT: ret
+define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%cmp = fcmp oeq <128 x half> %op1, %op2
@@ -215,7 +120,7 @@ define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
@@ -226,7 +131,7 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
@@ -236,7 +141,7 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
ret <4 x i32> %sext
}
-define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 {
+define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -255,7 +160,6 @@ define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0
}
define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 {
-; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -289,44 +193,16 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c)
ret void
}
-define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x2]
-; VBITS_GE_1024-NEXT: ret
+define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%cmp = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +211,16 @@ define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c)
ret void
}
-define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #24
-; VBITS_GE_256-NEXT: mov x14, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s
-; VBITS_GE_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s
-; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x2]
-; VBITS_GE_2048-NEXT: ret
+define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%cmp = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +230,7 @@ define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c)
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq d0, d0, d1
@@ -417,7 +241,7 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
@@ -427,7 +251,7 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
ret <2 x i64> %sext
}
-define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 {
+define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -446,7 +270,6 @@ define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #
}
define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 {
-; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -480,44 +303,16 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #
ret void
}
-define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d
-; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d
-; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_1024-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x2]
-; VBITS_GE_1024-NEXT: ret
+define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%cmp = fcmp oeq <16 x double> %op1, %op2
@@ -526,68 +321,16 @@ define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %
ret void
}
-define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov x14, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d
-; VBITS_GE_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d
-; VBITS_GE_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d
-; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d
-; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_2048-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x2]
-; VBITS_GE_2048-NEXT: ret
+define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x2]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%cmp = fcmp oeq <32 x double> %op1, %op2
@@ -600,7 +343,7 @@ define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %
; FCMP UEQ
;
-define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ueq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -624,7 +367,7 @@ define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ONE
;
-define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_one_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -648,7 +391,7 @@ define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UNE
;
-define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_une_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -670,7 +413,7 @@ define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OGT
;
-define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ogt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -692,7 +435,7 @@ define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UGT
;
-define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ugt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -716,7 +459,7 @@ define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OLT
;
-define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_olt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -738,7 +481,7 @@ define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ULT
;
-define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ult_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -762,7 +505,7 @@ define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OGE
;
-define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -784,7 +527,7 @@ define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UGE
;
-define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_uge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -808,7 +551,7 @@ define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OLE
;
-define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ole_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -830,7 +573,7 @@ define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ULE
;
-define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ule_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -854,7 +597,7 @@ define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UNO
;
-define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_uno_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -876,7 +619,7 @@ define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ORD
;
-define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ord_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -900,7 +643,7 @@ define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP EQ
;
-define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_eq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -922,7 +665,7 @@ define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP NE
;
-define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ne_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -944,7 +687,7 @@ define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP GT
;
-define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_gt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -966,7 +709,7 @@ define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP LT
;
-define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_lt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -988,7 +731,7 @@ define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP GE
;
-define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -1010,7 +753,7 @@ define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP LE
;
-define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_le_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
index 0ff987e2c461..67991510d310 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
+define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -38,7 +24,7 @@ define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
+define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -51,7 +37,7 @@ define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
ret void
}
-define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
+define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -66,7 +52,6 @@ define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
}
define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -86,91 +71,34 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
-
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
-define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f16_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h
-; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h
-; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v32f16_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
-define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v64f16_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #56
-; VBITS_GE_256-NEXT: mov x14, #48
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h
-; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h
-; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h
-; VBITS_GE_256-NEXT: fcvt z4.s, p0/m, z4.h
-; VBITS_GE_256-NEXT: fcvt z5.s, p0/m, z5.h
-; VBITS_GE_256-NEXT: fcvt z6.s, p0/m, z6.h
-; VBITS_GE_256-NEXT: fcvt z7.s, p0/m, z7.h
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v64f16_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fpext <64 x half> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@@ -182,7 +110,7 @@ define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
+define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f16_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
@@ -196,7 +124,7 @@ define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
}
; v2f16 is not legal for NEON, so use SVE
-define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
+define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -212,7 +140,7 @@ define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
ret void
}
-define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
+define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -246,91 +174,34 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
-
%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
ret void
}
-define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v16f16_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h
-; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h
-; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h
-; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f16_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f16_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.d }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.d }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.d }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h
-; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h
-; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h
-; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h
-; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.h
-; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.h
-; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.h
-; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.h
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f16_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -342,7 +213,7 @@ define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
+define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f32_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -356,7 +227,7 @@ define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
+define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -369,7 +240,7 @@ define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
ret void
}
-define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
+define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -384,7 +255,6 @@ define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
}
define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -410,84 +280,28 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v16f32_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s
-; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s
-; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s
-; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f32_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fpext <16 x float> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f32_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s
-; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s
-; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s
-; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s
-; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.s
-; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.s
-; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.s
-; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.s
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f32_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fpext <32 x float> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -499,7 +313,7 @@ define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
+define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -513,7 +327,7 @@ define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
+define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -526,7 +340,7 @@ define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
ret void
}
-define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
+define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f32_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -541,7 +355,18 @@ define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
}
define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
-; Ensure sensible type legalisation
+; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
+; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -555,90 +380,28 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
ret void
}
-define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f32_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_256-NEXT: fcvt z2.h, p0/m, z2.s
-; VBITS_GE_256-NEXT: fcvt z3.h, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.s }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v32f32_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptrunc <32 x float> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
ret void
}
-define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v64f32_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: mov x10, #48
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: mov x13, #40
-; VBITS_GE_256-NEXT: mov x14, #32
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: movprfx z0, z5
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.s
-; VBITS_GE_256-NEXT: movprfx z1, z4
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.s
-; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: movprfx z0, z6
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.s
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: movprfx z0, z2
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.s
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.s
-; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v64f32_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptrunc <64 x float> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
@@ -650,7 +413,7 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
+define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -664,7 +427,7 @@ define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
}
; v2f16 is not legal for NEON, so use SVE
-define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
+define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -680,7 +443,7 @@ define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
ret void
}
-define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
+define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -695,7 +458,6 @@ define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
}
define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
-; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -726,70 +488,28 @@ define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
ret void
}
-define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
-; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f64_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b
ret void
}
-define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f64_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x11, #12
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x13, #20
-; VBITS_GE_256-NEXT: mov x14, #16
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: movprfx z0, z5
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.d
-; VBITS_GE_256-NEXT: movprfx z1, z4
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.d
-; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: movprfx z0, z6
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.d
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: movprfx z0, z2
-; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.d
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.d
-; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f64_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
@@ -801,7 +521,7 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
+define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -814,7 +534,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
+define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtn v0.2s, v0.2d
@@ -825,7 +545,7 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
ret void
}
-define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
+define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -840,7 +560,18 @@ define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
}
define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
-; Ensure sensible type legalisation
+; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -854,90 +585,28 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
ret void
}
-define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v16f64_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.d
-; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f64_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
-define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f64_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x11, #12
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x13, #20
-; VBITS_GE_256-NEXT: mov x14, #16
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: movprfx z0, z5
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z5.d
-; VBITS_GE_256-NEXT: movprfx z1, z4
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z4.d
-; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: movprfx z0, z6
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z6.d
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: movprfx z0, z2
-; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z2.d
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z7.d
-; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f64_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
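
The restructured tests above lean on the vscale_range function attribute instead of one RUN line per -aarch64-sve-vector-bits-min value: vscale_range(min,max) lets the backend assume vscale is at least min (a max of 0 means unbounded), and since SVE registers are vscale x 128 bits, vscale_range(2,0) guarantees at least 256-bit registers, vscale_range(8,0) at least 1024, and vscale_range(16,0) at least 2048, so those functions only need the common CHECK prefix. A minimal sketch of the pattern, with a hypothetical function name and assuming the usual "+sve" attribute group used by these files:

; Sketch only, not part of the patch. A <8 x float> value is 256 bits, so with
; vscale_range(2,0) it fits in a single SVE register and needs no
; type-legalisation split, regardless of which vector-bits-min RUN line is used.
define void @example_fadd_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = fadd <8 x float> %op1, %op2
  store <8 x float> %res, <8 x float>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }
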
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
index b22663661796..2d0f8da1efaa 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s
+; RUN: llc -O3 -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -O3 -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -8,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) #0 {
+define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4h, v0.4h, v1.4h
@@ -20,7 +22,7 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) #0 {
+define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.8h, v0.8h, v1.8h
@@ -31,7 +33,7 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
ret <8 x half> %res
}
-define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
+define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -51,15 +53,31 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
}
define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
-; CHECK-LABEL: fma_v32f16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl32
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
-; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: st1h { z0.h }, p0, [x0]
-; CHECK-NEXT: ret
+; VBITS_GE_256-LABEL: fma_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2]
+; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z2.h, z4.h
+; VBITS_GE_256-NEXT: fmad z1.h, p0/m, z3.h, z5.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fma_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x2]
+; VBITS_GE_512-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%op3 = load <32 x half>, <32 x half>* %c
@@ -69,7 +87,7 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
ret void
}
-define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
+define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
@@ -88,7 +106,7 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
ret void
}
-define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #0 {
+define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
@@ -108,7 +126,7 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) #0 {
+define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s
@@ -120,7 +138,7 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) #0 {
+define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4s, v0.4s, v1.4s
@@ -131,7 +149,7 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
ret <4 x float> %res
}
-define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
+define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -151,15 +169,31 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
}
define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 {
-; CHECK-LABEL: fma_v16f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl16
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
-; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
-; CHECK-NEXT: ret
+; VBITS_GE_256-LABEL: fma_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2]
+; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z2.s, z4.s
+; VBITS_GE_256-NEXT: fmad z1.s, p0/m, z3.s, z5.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fma_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x2]
+; VBITS_GE_512-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%op3 = load <16 x float>, <16 x float>* %c
@@ -169,7 +203,7 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
ret void
}
-define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 {
+define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
@@ -188,7 +222,7 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
ret void
}
-define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 {
+define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
@@ -208,7 +242,7 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) #0 {
+define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmadd d0, d0, d1, d2
@@ -219,7 +253,7 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) #0 {
+define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2d, v0.2d, v1.2d
@@ -230,7 +264,7 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
ret <2 x double> %res
}
-define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 {
+define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -250,15 +284,31 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
}
define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 {
-; CHECK-LABEL: fma_v8f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl8
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
-; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x0]
-; CHECK-NEXT: ret
+; VBITS_GE_256-LABEL: fma_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2]
+; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z2.d, z4.d
+; VBITS_GE_256-NEXT: fmad z1.d, p0/m, z3.d, z5.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fma_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x2]
+; VBITS_GE_512-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%op3 = load <8 x double>, <8 x double>* %c
@@ -268,7 +318,7 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
ret void
}
-define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) #0 {
+define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
@@ -287,7 +337,7 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
ret void
}
-define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) #0 {
+define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
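
As the NOTE lines state, the CHECK bodies in these files are autogenerated; after changing the RUN lines or adding vscale_range attributes, the expectations are refreshed by rerunning the update script over the test. A hedged usage sketch (the build directory path is an assumption; --llc-binary may be omitted if llc is on PATH):

  $ llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll

The script runs llc for every RUN line and only emits assertions under a shared prefix when all RUN lines using that prefix produce the same output, which is why the 2048-bit run can reuse VBITS_GE_512: once the whole fixed-length vector fits in one register, the generated code no longer changes with the vector width.
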
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
index 0ed49e4aaaff..955169c0a130 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -1,55 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; FMAXNM
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v4f16:
-; CHECK: fmaxnm v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v8f16:
-; CHECK: fmaxnm v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
ret <8 x half> %res
}
-define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -58,26 +46,28 @@ define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fmaxnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmaxnm_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmaxnm_v32f16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmaxnm z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaxnm_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = call <32 x half> @llvm.maxnum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -85,14 +75,15 @@ define void @fmaxnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxnm_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = call <64 x half> @llvm.maxnum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -100,14 +91,15 @@ define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxnm_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = call <128 x half> @llvm.maxnum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -116,31 +108,34 @@ define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v2f32:
-; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v4f32:
-; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
ret <4 x float> %res
}
-define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -149,26 +144,28 @@ define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fmaxnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmaxnm_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmaxnm_v16f32:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmaxnm z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaxnm_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -176,14 +173,15 @@ define void @fmaxnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxnm_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = call <32 x float> @llvm.maxnum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -191,14 +189,15 @@ define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxnm_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = call <64 x float> @llvm.maxnum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -207,31 +206,34 @@ define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v1f64:
-; CHECK: fmaxnm d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm d0, d0, d1
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v2f64:
-; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
ret <2 x double> %res
}
-define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxnm_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -240,26 +242,28 @@ define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fmaxnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmaxnm_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmaxnm_v8f64:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmaxnm z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaxnm_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -267,14 +271,15 @@ define void @fmaxnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxnm_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -282,14 +287,15 @@ define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxnm_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = call <32 x double> @llvm.maxnum.v32f64(<32 x double> %op1, <32 x double> %op2)
@@ -302,31 +308,34 @@ define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v4f16:
-; CHECK: fminnm v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v8f16:
-; CHECK: fminnm v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
ret <8 x half> %res
}
-define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -335,26 +344,28 @@ define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fminnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fminnm_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fminnm_v32f16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT: fminnm z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminnm_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = call <32 x half> @llvm.minnum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -362,14 +373,15 @@ define void @fminnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fminnm_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = call <64 x half> @llvm.minnum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -377,14 +389,15 @@ define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fminnm_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = call <128 x half> @llvm.minnum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -393,31 +406,34 @@ define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v2f32:
-; CHECK: fminnm v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v4f32:
-; CHECK: fminnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
ret <4 x float> %res
}
-define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -426,26 +442,28 @@ define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fminnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fminnm_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fminnm_v16f32:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT: fminnm z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminnm_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = call <16 x float> @llvm.minnum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -453,14 +471,15 @@ define void @fminnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fminnm_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = call <32 x float> @llvm.minnum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -468,14 +487,15 @@ define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fminnm_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = call <64 x float> @llvm.minnum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -484,31 +504,34 @@ define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v1f64:
-; CHECK: fminnm d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d0, d0, d1
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v2f64:
-; CHECK: fminnm v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
ret <2 x double> %res
}
-define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fminnm_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -517,26 +540,28 @@ define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fminnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fminnm_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fminnm_v8f64:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fminnm z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT: fminnm z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminnm_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = call <8 x double> @llvm.minnum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -544,14 +569,15 @@ define void @fminnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fminnm_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = call <16 x double> @llvm.minnum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -559,14 +585,15 @@ define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fminnm_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = call <32 x double> @llvm.minnum.v32f64(<32 x double> %op1, <32 x double> %op2)
@@ -579,31 +606,34 @@ define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v4f16:
-; CHECK: fmax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v8f16:
-; CHECK: fmax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
ret <8 x half> %res
}
-define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -612,26 +642,28 @@ define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fmax_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmax_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmax_v32f16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmax z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT: fmax z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmax_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = call <32 x half> @llvm.maximum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -639,14 +671,15 @@ define void @fmax_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmax_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = call <64 x half> @llvm.maximum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -654,14 +687,15 @@ define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmax_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = call <128 x half> @llvm.maximum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -670,31 +704,34 @@ define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v2f32:
-; CHECK: fmax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v4f32:
-; CHECK: fmax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
ret <4 x float> %res
}
-define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -703,26 +740,28 @@ define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fmax_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmax_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmax_v16f32:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmax z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT: fmax z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmax_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = call <16 x float> @llvm.maximum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -730,14 +769,15 @@ define void @fmax_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmax_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = call <32 x float> @llvm.maximum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -745,14 +785,15 @@ define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmax_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = call <64 x float> @llvm.maximum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -761,31 +802,34 @@ define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v1f64:
-; CHECK: fmax d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax d0, d0, d1
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v2f64:
-; CHECK: fmax v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
ret <2 x double> %res
}
-define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmax_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -794,26 +838,28 @@ define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fmax_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmax_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmax_v8f64:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmax z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT: fmax z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmax_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = call <8 x double> @llvm.maximum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -821,14 +867,15 @@ define void @fmax_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmax_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = call <16 x double> @llvm.maximum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -836,14 +883,15 @@ define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmax_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = call <32 x double> @llvm.maximum.v32f64(<32 x double> %op1, <32 x double> %op2)
@@ -856,31 +904,34 @@ define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v4f16:
-; CHECK: fmin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v8f16:
-; CHECK: fmin v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
ret <8 x half> %res
}
-define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
%res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -889,26 +940,28 @@ define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @fmin_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmin_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmin_v32f16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmin z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT: fmin z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmin_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = call <32 x half> @llvm.minimum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -916,14 +969,15 @@ define void @fmin_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmin_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%res = call <64 x half> @llvm.minimum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -931,14 +985,15 @@ define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmin_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%res = call <128 x half> @llvm.minimum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -947,31 +1002,34 @@ define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v2f32:
-; CHECK: fmin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v4f32:
-; CHECK: fmin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
ret <4 x float> %res
}
-define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
%res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -980,26 +1038,28 @@ define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @fmin_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmin_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmin_v16f32:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmin z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT: fmin z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmin_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = call <16 x float> @llvm.minimum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -1007,14 +1067,15 @@ define void @fmin_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmin_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%res = call <32 x float> @llvm.minimum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -1022,14 +1083,15 @@ define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmin_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%res = call <64 x float> @llvm.minimum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -1038,31 +1100,34 @@ define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v1f64:
-; CHECK: fmin d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmin d0, d0, d1
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v2f64:
-; CHECK: fmin v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmin v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
ret <2 x double> %res
}
-define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmin_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
%res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -1071,26 +1136,28 @@ define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @fmin_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmin_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmin_v8f64:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: fmin z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT: fmin z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmin_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: fmin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = call <8 x double> @llvm.minimum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -1098,14 +1165,15 @@ define void @fmin_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmin_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%res = call <16 x double> @llvm.minimum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -1113,14 +1181,15 @@ define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @fmin_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmin_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmin_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%res = call <32 x double> @llvm.minimum.v32f64(<32 x double> %op1, <32 x double> %op2)
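
For anyone refreshing these expectations locally, a minimal sketch of the regeneration step is below; the build directory and the exact test paths are assumptions for illustration, not part of this patch:

  # assumed layout: llc built under ./build, run from the llvm-project root
  $ llvm/utils/update_llc_test_checks.py \
      --llc-binary=build/bin/llc \
      llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll \
      llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll

The script reruns each RUN line in the named tests and rewrites the CHECK bodies per prefix, which is why the prefixes above (CHECK, VBITS_EQ_256, VBITS_GE_512) now carry full // %bb.0 blocks instead of hand-written patterns.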
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
index 55cc1ebc8114..fc6fb7c85dce 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
@@ -1,243 +1,297 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; FADDA
;
; No single instruction NEON support. Use SVE.
-define half @fadda_v4f16(half %start, <4 x half> %a) #0 {
+define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v4f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: fadda h0, p0, h0, z1.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
ret half %res
}
; No single instruction NEON support. Use SVE.
-define half @fadda_v8f16(half %start, <8 x half> %a) #0 {
+define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v8f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: fadda h0, p0, h0, z1.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
ret half %res
}
-define half @fadda_v16f16(half %start, <16 x half>* %a) #0 {
+define half @fadda_v16f16(half %start, <16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: fadda h0, p0, h0, z1.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
ret half %res
}
define half @fadda_v32f16(half %start, <32 x half>* %a) #0 {
-; CHECK-LABEL: fadda_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h
-; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fadda_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fadda h0, p0, h0, z2.h
+; VBITS_GE_256-NEXT: fadda h0, p0, h0, z1.h
+; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fadda_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fadda h0, p0, h0, z1.h
+; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
ret half %res
}
-define half @fadda_v64f16(half %start, <64 x half>* %a) #0 {
+define half @fadda_v64f16(half %start, <64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fadda_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: fadda h0, p0, h0, z1.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
ret half %res
}
-define half @fadda_v128f16(half %start, <128 x half>* %a) #0 {
+define half @fadda_v128f16(half %start, <128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fadda_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: fadda h0, p0, h0, z1.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
ret half %res
}
; No single instruction NEON support. Use SVE.
-define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
+define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v2f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: fadda s0, p0, s0, z1.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
ret float %res
}
; No single instruction NEON support. Use SVE.
-define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
+define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v4f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: fadda s0, p0, s0, z1.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
ret float %res
}
-define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
+define float @fadda_v8f32(float %start, <8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: fadda s0, p0, s0, z1.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
ret float %res
}
define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
-; CHECK-LABEL: fadda_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s
-; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fadda_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fadda s0, p0, s0, z2.s
+; VBITS_GE_256-NEXT: fadda s0, p0, s0, z1.s
+; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fadda_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fadda s0, p0, s0, z1.s
+; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
ret float %res
}
-define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
+define float @fadda_v32f32(float %start, <32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fadda_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: fadda s0, p0, s0, z1.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
ret float %res
}
-define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
+define float @fadda_v64f32(float %start, <64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fadda_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: fadda s0, p0, s0, z1.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
ret float %res
}
; No single instruction NEON support. Use SVE.
-define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
+define double @fadda_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v1f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: fadda d0, p0, d0, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
ret double %res
}
; No single instruction NEON support. Use SVE.
-define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
+define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v2f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: fadda d0, p0, d0, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
ret double %res
}
-define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
+define double @fadda_v4f64(double %start, <4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: fadda d0, p0, d0, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
ret double %res
}
define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
-; CHECK-LABEL: fadda_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d
-; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fadda_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fadda d0, p0, d0, z2.d
+; VBITS_GE_256-NEXT: fadda d0, p0, d0, z1.d
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fadda_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fadda d0, p0, d0, z1.d
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
ret double %res
}
-define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
+define double @fadda_v16f64(double %start, <16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fadda_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: fadda d0, p0, d0, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
ret double %res
}
-define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
+define double @fadda_v32f64(double %start, <32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fadda_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: fadda d0, p0, d0, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
ret double %res
@@ -248,236 +302,260 @@ define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
;
; No single instruction NEON support for 4 element vectors.
-define half @faddv_v4f16(half %start, <4 x half> %a) #0 {
+define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v4f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
-; CHECK-NEXT: fadd h0, h0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: faddv h1, p0, z1.h
+; CHECK-NEXT: fadd h0, h0, h1
+; CHECK-NEXT: ret
%res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
ret half %res
}
; No single instruction NEON support for 8 element vectors.
-define half @faddv_v8f16(half %start, <8 x half> %a) #0 {
+define half @faddv_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v8f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
-; CHECK-NEXT: fadd h0, h0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: faddv h1, p0, z1.h
+; CHECK-NEXT: fadd h0, h0, h1
+; CHECK-NEXT: ret
%res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
ret half %res
}
-define half @faddv_v16f16(half %start, <16 x half>* %a) #0 {
+define half @faddv_v16f16(half %start, <16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fadd h0, h0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: faddv h1, p0, z1.h
+; CHECK-NEXT: fadd h0, h0, h1
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
ret half %res
}
define half @faddv_v32f16(half %start, <32 x half>* %a) #0 {
-; CHECK-LABEL: faddv_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fadd h0, h0, [[RDX]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: faddv h1, [[PG]], [[ADD]].h
-; VBITS_EQ_256-DAG: fadd h0, h0, [[RDX]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: faddv_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fadd z1.h, p0/m, z1.h, z2.h
+; VBITS_GE_256-NEXT: faddv h1, p0, z1.h
+; VBITS_GE_256-NEXT: fadd h0, h0, h1
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: faddv_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: faddv h1, p0, z1.h
+; VBITS_GE_512-NEXT: fadd h0, h0, h1
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
ret half %res
}
-define half @faddv_v64f16(half %start, <64 x half>* %a) #0 {
+define half @faddv_v64f16(half %start, <64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: faddv_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fadd h0, h0, [[RDX]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: faddv h1, p0, z1.h
+; CHECK-NEXT: fadd h0, h0, h1
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
ret half %res
}
-define half @faddv_v128f16(half %start, <128 x half>* %a) #0 {
+define half @faddv_v128f16(half %start, <128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: faddv_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fadd h0, h0, [[RDX]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: faddv h1, p0, z1.h
+; CHECK-NEXT: fadd h0, h0, h1
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
ret half %res
}
; Don't use SVE for 2 element vectors.
-define float @faddv_v2f32(float %start, <2 x float> %a) #0 {
+define float @faddv_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v2f32:
-; CHECK: faddp s1, v1.2s
-; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp s1, v1.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
%res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
ret float %res
}
; No single instruction NEON support for 4 element vectors.
-define float @faddv_v4f32(float %start, <4 x float> %a) #0 {
+define float @faddv_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v4f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], z1.s
-; CHECK-NEXT: fadd s0, s0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
%res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
ret float %res
}
-define float @faddv_v8f32(float %start, <8 x float>* %a) #0 {
+define float @faddv_v8f32(float %start, <8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fadd s0, s0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
ret float %res
}
define float @faddv_v16f32(float %start, <16 x float>* %a) #0 {
-; CHECK-LABEL: faddv_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fadd s0, s0, [[RDX]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s
-; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: faddv_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fadd z1.s, p0/m, z1.s, z2.s
+; VBITS_GE_256-NEXT: faddv s1, p0, z1.s
+; VBITS_GE_256-NEXT: fadd s0, s0, s1
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: faddv_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: faddv s1, p0, z1.s
+; VBITS_GE_512-NEXT: fadd s0, s0, s1
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
ret float %res
}
-define float @faddv_v32f32(float %start, <32 x float>* %a) #0 {
+define float @faddv_v32f32(float %start, <32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: faddv_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fadd s0, s0, [[RDX]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
ret float %res
}
-define float @faddv_v64f32(float %start, <64 x float>* %a) #0 {
+define float @faddv_v64f32(float %start, <64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: faddv_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fadd s0, s0, [[RDX]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
ret float %res
}
; Don't use SVE for 1 element vectors.
-define double @faddv_v1f64(double %start, <1 x double> %a) #0 {
+define double @faddv_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v1f64:
-; CHECK: fadd d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
%res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
ret double %res
}
; Don't use SVE for 2 element vectors.
-define double @faddv_v2f64(double %start, <2 x double> %a) #0 {
+define double @faddv_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v2f64:
-; CHECK: faddp d1, v1.2d
-; CHECK-NEXT: fadd d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp d1, v1.2d
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
%res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
ret double %res
}
-define double @faddv_v4f64(double %start, <4 x double>* %a) #0 {
+define double @faddv_v4f64(double %start, <4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fadd d0, d0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: faddv d1, p0, z1.d
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
ret double %res
}
define double @faddv_v8f64(double %start, <8 x double>* %a) #0 {
-; CHECK-LABEL: faddv_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fadd d0, d0, [[RDX]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d
-; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: faddv_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fadd z1.d, p0/m, z1.d, z2.d
+; VBITS_GE_256-NEXT: faddv d1, p0, z1.d
+; VBITS_GE_256-NEXT: fadd d0, d0, d1
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: faddv_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: faddv d1, p0, z1.d
+; VBITS_GE_512-NEXT: fadd d0, d0, d1
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
ret double %res
}
-define double @faddv_v16f64(double %start, <16 x double>* %a) #0 {
+define double @faddv_v16f64(double %start, <16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: faddv_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fadd d0, d0, [[RDX]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: faddv d1, p0, z1.d
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
ret double %res
}
-define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
+define double @faddv_v32f64(double %start, <32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: faddv_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fadd d0, d0, [[RDX]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: faddv d1, p0, z1.d
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
ret double %res
@@ -488,213 +566,248 @@ define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
;
; No NEON 16-bit vector FMAXNMV support. Use SVE.
-define half @fmaxv_v4f16(<4 x half> %a) #0 {
+define half @fmaxv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v4f16:
-; CHECK: fmaxnmv h0, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnmv h0, v0.4h
+; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
ret half %res
}
; No NEON 16-bit vector FMAXNMV support. Use SVE.
-define half @fmaxv_v8f16(<8 x half> %a) #0 {
+define half @fmaxv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v8f16:
-; CHECK: fmaxnmv h0, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnmv h0, v0.8h
+; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
ret half %res
}
-define half @fmaxv_v16f16(<16 x half>* %a) #0 {
+define half @fmaxv_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv h0, p0, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
ret half %res
}
define half @fmaxv_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fmaxv_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: fmaxnmv h0, [[PG]], [[MAX]].h
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fmaxv_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: fmaxnmv h0, p0, z0.h
+; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaxv_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fmaxnmv h0, p0, z0.h
+; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
ret half %res
}
-define half @fmaxv_v64f16(<64 x half>* %a) #0 {
+define half @fmaxv_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxv_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv h0, p0, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
ret half %res
}
-define half @fmaxv_v128f16(<128 x half>* %a) #0 {
+define half @fmaxv_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxv_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv h0, p0, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
ret half %res
}
; Don't use SVE for 64-bit f32 vectors.
-define float @fmaxv_v2f32(<2 x float> %a) #0 {
+define float @fmaxv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v2f32:
-; CHECK: fmaxnmp s0, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnmp s0, v0.2s
+; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
ret float %res
}
; Don't use SVE for 128-bit f32 vectors.
-define float @fmaxv_v4f32(<4 x float> %a) #0 {
+define float @fmaxv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v4f32:
-; CHECK: fmaxnmv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnmv s0, v0.4s
+; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
ret float %res
}
-define float @fmaxv_v8f32(<8 x float>* %a) #0 {
+define float @fmaxv_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv s0, p0, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
ret float %res
}
define float @fmaxv_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fmaxv_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: fmaxnmv s0, [[PG]], [[MAX]].s
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fmaxv_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: fmaxnmv s0, p0, z0.s
+; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaxv_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fmaxnmv s0, p0, z0.s
+; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
ret float %res
}
-define float @fmaxv_v32f32(<32 x float>* %a) #0 {
+define float @fmaxv_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxv_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv s0, p0, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
ret float %res
}
-define float @fmaxv_v64f32(<64 x float>* %a) #0 {
+define float @fmaxv_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxv_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv s0, p0, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
ret float %res
}
; Nothing to do for single element vectors.
-define double @fmaxv_v1f64(<1 x double> %a) #0 {
+define double @fmaxv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v1f64:
-; CHECK-NOT: fmax
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
ret double %res
}
; Don't use SVE for 128-bit f64 vectors.
-define double @fmaxv_v2f64(<2 x double> %a) #0 {
+define double @fmaxv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v2f64:
-; CHECK: fmaxnmp d0, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnmp d0, v0.2d
+; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
ret double %res
}
-define double @fmaxv_v4f64(<4 x double>* %a) #0 {
+define double @fmaxv_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv d0, p0, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
ret double %res
}
define double @fmaxv_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fmaxv_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: fmaxnmv d0, [[PG]], [[MAX]].d
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fmaxv_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT: fmaxnmv d0, p0, z0.d
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fmaxv_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fmaxnmv d0, p0, z0.d
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
ret double %res
}
-define double @fmaxv_v16f64(<16 x double>* %a) #0 {
+define double @fmaxv_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxv_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv d0, p0, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
ret double %res
}
-define double @fmaxv_v32f64(<32 x double>* %a) #0 {
+define double @fmaxv_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxv_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fmaxnmv d0, p0, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
ret double %res
@@ -705,213 +818,248 @@ define double @fmaxv_v32f64(<32 x double>* %a) #0 {
;
; No NEON 16-bit vector FMINNMV support. Use SVE.
-define half @fminv_v4f16(<4 x half> %a) #0 {
+define half @fminv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v4f16:
-; CHECK: fminnmv h0, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnmv h0, v0.4h
+; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
ret half %res
}
; No NEON 16-bit vector FMINNMV support. Use SVE.
-define half @fminv_v8f16(<8 x half> %a) #0 {
+define half @fminv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v8f16:
-; CHECK: fminnmv h0, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnmv h0, v0.8h
+; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
ret half %res
}
-define half @fminv_v16f16(<16 x half>* %a) #0 {
+define half @fminv_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fminnmv h0, p0, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
ret half %res
}
define half @fminv_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fminv_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: fminnmv h0, [[PG]], [[MIN]].h
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fminv_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: fminnmv h0, p0, z0.h
+; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminv_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fminnmv h0, p0, z0.h
+; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
ret half %res
}
-define half @fminv_v64f16(<64 x half>* %a) #0 {
+define half @fminv_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminv_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fminnmv h0, p0, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
ret half %res
}
-define half @fminv_v128f16(<128 x half>* %a) #0 {
+define half @fminv_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminv_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fminnmv h0, p0, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
ret half %res
}
; Don't use SVE for 64-bit f32 vectors.
-define float @fminv_v2f32(<2 x float> %a) #0 {
+define float @fminv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v2f32:
-; CHECK: fminnmp s0, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnmp s0, v0.2s
+; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
ret float %res
}
; Don't use SVE for 128-bit f32 vectors.
-define float @fminv_v4f32(<4 x float> %a) #0 {
+define float @fminv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v4f32:
-; CHECK: fminnmv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnmv s0, v0.4s
+; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
ret float %res
}
-define float @fminv_v8f32(<8 x float>* %a) #0 {
+define float @fminv_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fminnmv s0, p0, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
ret float %res
}
define float @fminv_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fminv_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: fminnmv s0, [[PG]], [[MIN]].s
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fminv_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: fminnmv s0, p0, z0.s
+; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminv_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fminnmv s0, p0, z0.s
+; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
ret float %res
}
-define float @fminv_v32f32(<32 x float>* %a) #0 {
+define float @fminv_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminv_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fminnmv s0, p0, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
ret float %res
}
-define float @fminv_v64f32(<64 x float>* %a) #0 {
+define float @fminv_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminv_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fminnmv s0, p0, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
ret float %res
}
; Nothing to do for single element vectors.
-define double @fminv_v1f64(<1 x double> %a) #0 {
+define double @fminv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v1f64:
-; CHECK-NOT: fmin
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
ret double %res
}
; Don't use SVE for 128-bit f64 vectors.
-define double @fminv_v2f64(<2 x double> %a) #0 {
+define double @fminv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v2f64:
-; CHECK: fminnmp d0, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnmp d0, v0.2d
+; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
ret double %res
}
-define double @fminv_v4f64(<4 x double>* %a) #0 {
+define double @fminv_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fminnmv d0, p0, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
ret double %res
}
define double @fminv_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fminv_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: fminnmv d0, [[PG]], [[MIN]].d
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fminv_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT: fminnmv d0, p0, z0.d
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fminv_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fminnmv d0, p0, z0.d
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
ret double %res
}
-define double @fminv_v16f64(<16 x double>* %a) #0 {
+define double @fminv_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminv_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fminnmv d0, p0, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
ret double %res
}
-define double @fminv_v32f64(<32 x double>* %a) #0 {
+define double @fminv_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminv_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fminnmv d0, p0, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
index e9a0c3658a3d..0f5afa5b17b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
@@ -1,54 +1,42 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; CEIL -> FRINTP
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintp_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintp_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v4f16:
-; CHECK: frintp v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintp v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintp_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintp_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v8f16:
-; CHECK: frintp v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintp v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frintp_v16f16(<16 x half>* %a) #0 {
+define void @frintp_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -56,49 +44,53 @@ define void @frintp_v16f16(<16 x half>* %a) #0 {
}
define void @frintp_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintp_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintp_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintp z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frintp z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintp_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintp z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.ceil.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frintp_v64f16(<64 x half>* %a) #0 {
+define void @frintp_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintp_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.ceil.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frintp_v128f16(<128 x half>* %a) #0 {
+define void @frintp_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintp_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.ceil.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -106,30 +98,33 @@ define void @frintp_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintp_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintp_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v2f32:
-; CHECK: frintp v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintp v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintp_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintp_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v4f32:
-; CHECK: frintp v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintp v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frintp_v8f32(<8 x float>* %a) #0 {
+define void @frintp_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -137,49 +132,53 @@ define void @frintp_v8f32(<8 x float>* %a) #0 {
}
define void @frintp_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintp_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintp_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintp z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frintp z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintp_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintp z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frintp_v32f32(<32 x float>* %a) #0 {
+define void @frintp_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintp_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.ceil.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frintp_v64f32(<64 x float>* %a) #0 {
+define void @frintp_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintp_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.ceil.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -187,30 +186,33 @@ define void @frintp_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintp_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintp_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v1f64:
-; CHECK: frintp d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintp d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintp_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintp_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v2f64:
-; CHECK: frintp v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintp v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frintp_v4f64(<4 x double>* %a) #0 {
+define void @frintp_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintp_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -218,49 +220,53 @@ define void @frintp_v4f64(<4 x double>* %a) #0 {
}
define void @frintp_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintp_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintp_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintp z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frintp z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintp_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintp z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frintp_v16f64(<16 x double>* %a) #0 {
+define void @frintp_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintp_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.ceil.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frintp_v32f64(<32 x double>* %a) #0 {
+define void @frintp_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintp_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintp z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.ceil.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -272,30 +278,33 @@ define void @frintp_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintm_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintm_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v4f16:
-; CHECK: frintm v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintm v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintm_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintm_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v8f16:
-; CHECK: frintm v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintm v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frintm_v16f16(<16 x half>* %a) #0 {
+define void @frintm_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -303,49 +312,53 @@ define void @frintm_v16f16(<16 x half>* %a) #0 {
}
define void @frintm_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintm_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintm_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintm z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frintm z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintm_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintm z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.floor.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frintm_v64f16(<64 x half>* %a) #0 {
+define void @frintm_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintm_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.floor.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frintm_v128f16(<128 x half>* %a) #0 {
+define void @frintm_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintm_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.floor.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -353,30 +366,33 @@ define void @frintm_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintm_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintm_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v2f32:
-; CHECK: frintm v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintm v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintm_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintm_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v4f32:
-; CHECK: frintm v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintm v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frintm_v8f32(<8 x float>* %a) #0 {
+define void @frintm_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -384,49 +400,53 @@ define void @frintm_v8f32(<8 x float>* %a) #0 {
}
define void @frintm_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintm_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintm_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintm z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frintm z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintm_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintm z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.floor.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frintm_v32f32(<32 x float>* %a) #0 {
+define void @frintm_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintm_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.floor.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frintm_v64f32(<64 x float>* %a) #0 {
+define void @frintm_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintm_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.floor.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -434,30 +454,33 @@ define void @frintm_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintm_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintm_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v1f64:
-; CHECK: frintm d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintm d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintm_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintm_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v2f64:
-; CHECK: frintm v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintm v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frintm_v4f64(<4 x double>* %a) #0 {
+define void @frintm_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintm_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -465,49 +488,53 @@ define void @frintm_v4f64(<4 x double>* %a) #0 {
}
define void @frintm_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintm_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintm_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintm z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frintm z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintm_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintm z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.floor.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frintm_v16f64(<16 x double>* %a) #0 {
+define void @frintm_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintm_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.floor.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frintm_v32f64(<32 x double>* %a) #0 {
+define void @frintm_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintm_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintm z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.floor.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -519,30 +546,33 @@ define void @frintm_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frinti_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frinti_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v4f16:
-; CHECK: frinti v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinti v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frinti_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frinti_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v8f16:
-; CHECK: frinti v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinti v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frinti_v16f16(<16 x half>* %a) #0 {
+define void @frinti_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -550,49 +580,53 @@ define void @frinti_v16f16(<16 x half>* %a) #0 {
}
define void @frinti_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frinti_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinti_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frinti z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frinti z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frinti_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frinti z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.nearbyint.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frinti_v64f16(<64 x half>* %a) #0 {
+define void @frinti_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frinti_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.nearbyint.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frinti_v128f16(<128 x half>* %a) #0 {
+define void @frinti_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frinti_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.nearbyint.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -600,30 +634,33 @@ define void @frinti_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frinti_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frinti_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v2f32:
-; CHECK: frinti v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinti v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frinti_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frinti_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v4f32:
-; CHECK: frinti v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinti v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frinti_v8f32(<8 x float>* %a) #0 {
+define void @frinti_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -631,49 +668,53 @@ define void @frinti_v8f32(<8 x float>* %a) #0 {
}
define void @frinti_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frinti_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinti_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frinti z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frinti z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frinti_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frinti z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frinti_v32f32(<32 x float>* %a) #0 {
+define void @frinti_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frinti_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.nearbyint.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frinti_v64f32(<64 x float>* %a) #0 {
+define void @frinti_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frinti_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.nearbyint.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -681,30 +722,33 @@ define void @frinti_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frinti_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frinti_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v1f64:
-; CHECK: frinti d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinti d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frinti_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frinti_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v2f64:
-; CHECK: frinti v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinti v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frinti_v4f64(<4 x double>* %a) #0 {
+define void @frinti_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frinti_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -712,49 +756,53 @@ define void @frinti_v4f64(<4 x double>* %a) #0 {
}
define void @frinti_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frinti_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinti_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frinti z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frinti z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frinti_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frinti z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frinti_v16f64(<16 x double>* %a) #0 {
+define void @frinti_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frinti_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.nearbyint.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frinti_v32f64(<32 x double>* %a) #0 {
+define void @frinti_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frinti_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frinti z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.nearbyint.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -766,30 +814,33 @@ define void @frinti_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintx_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintx_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v4f16:
-; CHECK: frintx v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintx v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintx_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintx_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v8f16:
-; CHECK: frintx v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintx v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frintx_v16f16(<16 x half>* %a) #0 {
+define void @frintx_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -797,49 +848,53 @@ define void @frintx_v16f16(<16 x half>* %a) #0 {
}
define void @frintx_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintx_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintx_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintx z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frintx z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintx_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintx z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.rint.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frintx_v64f16(<64 x half>* %a) #0 {
+define void @frintx_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintx_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.rint.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frintx_v128f16(<128 x half>* %a) #0 {
+define void @frintx_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintx_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.rint.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -847,30 +902,33 @@ define void @frintx_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintx_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintx_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v2f32:
-; CHECK: frintx v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintx v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintx_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintx_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v4f32:
-; CHECK: frintx v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintx v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frintx_v8f32(<8 x float>* %a) #0 {
+define void @frintx_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -878,49 +936,53 @@ define void @frintx_v8f32(<8 x float>* %a) #0 {
}
define void @frintx_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintx_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintx_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintx z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frintx z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintx_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintx z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.rint.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frintx_v32f32(<32 x float>* %a) #0 {
+define void @frintx_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintx_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.rint.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frintx_v64f32(<64 x float>* %a) #0 {
+define void @frintx_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintx_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.rint.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -928,30 +990,33 @@ define void @frintx_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintx_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintx_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v1f64:
-; CHECK: frintx d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintx d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintx_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintx_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v2f64:
-; CHECK: frintx v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintx v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frintx_v4f64(<4 x double>* %a) #0 {
+define void @frintx_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintx_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -959,49 +1024,53 @@ define void @frintx_v4f64(<4 x double>* %a) #0 {
}
define void @frintx_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintx_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintx_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintx z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frintx z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintx_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintx z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.rint.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frintx_v16f64(<16 x double>* %a) #0 {
+define void @frintx_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintx_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.rint.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frintx_v32f64(<32 x double>* %a) #0 {
+define void @frintx_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintx_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintx z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.rint.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -1013,30 +1082,33 @@ define void @frintx_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frinta_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frinta_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v4f16:
-; CHECK: frinta v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinta v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frinta_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frinta_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v8f16:
-; CHECK: frinta v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinta v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frinta_v16f16(<16 x half>* %a) #0 {
+define void @frinta_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -1044,49 +1116,53 @@ define void @frinta_v16f16(<16 x half>* %a) #0 {
}
define void @frinta_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frinta_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinta_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frinta z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frinta z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frinta_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frinta z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.round.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frinta_v64f16(<64 x half>* %a) #0 {
+define void @frinta_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frinta_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.round.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frinta_v128f16(<128 x half>* %a) #0 {
+define void @frinta_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frinta_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.round.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -1094,30 +1170,33 @@ define void @frinta_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frinta_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frinta_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v2f32:
-; CHECK: frinta v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinta v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frinta_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frinta_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v4f32:
-; CHECK: frinta v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinta v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frinta_v8f32(<8 x float>* %a) #0 {
+define void @frinta_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -1125,49 +1204,53 @@ define void @frinta_v8f32(<8 x float>* %a) #0 {
}
define void @frinta_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frinta_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinta_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frinta z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frinta z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frinta_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frinta z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.round.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frinta_v32f32(<32 x float>* %a) #0 {
+define void @frinta_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frinta_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.round.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frinta_v64f32(<64 x float>* %a) #0 {
+define void @frinta_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frinta_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.round.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -1175,30 +1258,33 @@ define void @frinta_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frinta_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frinta_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v1f64:
-; CHECK: frinta d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinta d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frinta_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frinta_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v2f64:
-; CHECK: frinta v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frinta v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frinta_v4f64(<4 x double>* %a) #0 {
+define void @frinta_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frinta_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -1206,49 +1292,53 @@ define void @frinta_v4f64(<4 x double>* %a) #0 {
}
define void @frinta_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frinta_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinta_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frinta z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frinta z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frinta_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frinta z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.round.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frinta_v16f64(<16 x double>* %a) #0 {
+define void @frinta_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frinta_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.round.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frinta_v32f64(<32 x double>* %a) #0 {
+define void @frinta_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frinta_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frinta z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.round.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -1260,30 +1350,33 @@ define void @frinta_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintn_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintn_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v4f16:
-; CHECK: frintn v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintn v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintn_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintn_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v8f16:
-; CHECK: frintn v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintn v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frintn_v16f16(<16 x half>* %a) #0 {
+define void @frintn_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -1291,49 +1384,53 @@ define void @frintn_v16f16(<16 x half>* %a) #0 {
}
define void @frintn_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintn_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintn_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintn z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frintn z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintn_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintn z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.roundeven.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frintn_v64f16(<64 x half>* %a) #0 {
+define void @frintn_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintn_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.roundeven.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frintn_v128f16(<128 x half>* %a) #0 {
+define void @frintn_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintn_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.roundeven.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -1341,30 +1438,33 @@ define void @frintn_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintn_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintn_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v2f32:
-; CHECK: frintn v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintn v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintn_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintn_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v4f32:
-; CHECK: frintn v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintn v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frintn_v8f32(<8 x float>* %a) #0 {
+define void @frintn_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -1372,49 +1472,53 @@ define void @frintn_v8f32(<8 x float>* %a) #0 {
}
define void @frintn_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintn_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintn_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintn z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frintn z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintn_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintn z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frintn_v32f32(<32 x float>* %a) #0 {
+define void @frintn_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintn_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.roundeven.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frintn_v64f32(<64 x float>* %a) #0 {
+define void @frintn_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintn_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.roundeven.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -1422,30 +1526,33 @@ define void @frintn_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintn_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintn_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v1f64:
-; CHECK: frintn d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintn d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintn_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintn_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v2f64:
-; CHECK: frintn v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintn v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frintn_v4f64(<4 x double>* %a) #0 {
+define void @frintn_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintn_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -1453,49 +1560,53 @@ define void @frintn_v4f64(<4 x double>* %a) #0 {
}
define void @frintn_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintn_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintn_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintn z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frintn z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintn_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintn z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frintn_v16f64(<16 x double>* %a) #0 {
+define void @frintn_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintn_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.roundeven.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frintn_v32f64(<32 x double>* %a) #0 {
+define void @frintn_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintn_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintn z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.roundeven.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
@@ -1507,30 +1618,33 @@ define void @frintn_v32f64(<32 x double>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintz_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintz_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v4f16:
-; CHECK: frintz v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
ret <4 x half> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintz_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintz_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v8f16:
-; CHECK: frintz v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
ret <8 x half> %res
}
-define void @frintz_v16f16(<16 x half>* %a) #0 {
+define void @frintz_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x half>, <16 x half>* %a
%res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
store <16 x half> %res, <16 x half>* %a
@@ -1538,49 +1652,53 @@ define void @frintz_v16f16(<16 x half>* %a) #0 {
}
define void @frintz_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintz_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintz_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintz z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: frintz z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintz_v32f16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintz z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x half>, <32 x half>* %a
%res = call <32 x half> @llvm.trunc.v32f16(<32 x half> %op)
store <32 x half> %res, <32 x half>* %a
ret void
}
-define void @frintz_v64f16(<64 x half>* %a) #0 {
+define void @frintz_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintz_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%res = call <64 x half> @llvm.trunc.v64f16(<64 x half> %op)
store <64 x half> %res, <64 x half>* %a
ret void
}
-define void @frintz_v128f16(<128 x half>* %a) #0 {
+define void @frintz_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintz_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%res = call <128 x half> @llvm.trunc.v128f16(<128 x half> %op)
store <128 x half> %res, <128 x half>* %a
@@ -1588,30 +1706,33 @@ define void @frintz_v128f16(<128 x half>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintz_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintz_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v2f32:
-; CHECK: frintz v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
ret <2 x float> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintz_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintz_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v4f32:
-; CHECK: frintz v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
ret <4 x float> %res
}
-define void @frintz_v8f32(<8 x float>* %a) #0 {
+define void @frintz_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x float>, <8 x float>* %a
%res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
store <8 x float> %res, <8 x float>* %a
@@ -1619,49 +1740,53 @@ define void @frintz_v8f32(<8 x float>* %a) #0 {
}
define void @frintz_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintz_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintz_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintz z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: frintz z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintz_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintz z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x float>, <16 x float>* %a
%res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %op)
store <16 x float> %res, <16 x float>* %a
ret void
}
-define void @frintz_v32f32(<32 x float>* %a) #0 {
+define void @frintz_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintz_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%res = call <32 x float> @llvm.trunc.v32f32(<32 x float> %op)
store <32 x float> %res, <32 x float>* %a
ret void
}
-define void @frintz_v64f32(<64 x float>* %a) #0 {
+define void @frintz_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintz_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%res = call <64 x float> @llvm.trunc.v64f32(<64 x float> %op)
store <64 x float> %res, <64 x float>* %a
@@ -1669,30 +1794,33 @@ define void @frintz_v64f32(<64 x float>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintz_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintz_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v1f64:
-; CHECK: frintz d0, d0
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz d0, d0
+; CHECK-NEXT: ret
%res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
ret <1 x double> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintz_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintz_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v2f64:
-; CHECK: frintz v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
ret <2 x double> %res
}
-define void @frintz_v4f64(<4 x double>* %a) #0 {
+define void @frintz_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: frintz_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x double>, <4 x double>* %a
%res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
store <4 x double> %res, <4 x double>* %a
@@ -1700,49 +1828,53 @@ define void @frintz_v4f64(<4 x double>* %a) #0 {
}
define void @frintz_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintz_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintz_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: frintz z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: frintz z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: frintz_v8f64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: frintz z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x double>, <8 x double>* %a
%res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %op)
store <8 x double> %res, <8 x double>* %a
ret void
}
-define void @frintz_v16f64(<16 x double>* %a) #0 {
+define void @frintz_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: frintz_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%res = call <16 x double> @llvm.trunc.v16f64(<16 x double> %op)
store <16 x double> %res, <16 x double>* %a
ret void
}
-define void @frintz_v32f64(<32 x double>* %a) #0 {
+define void @frintz_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: frintz_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: frintz z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%res = call <32 x double> @llvm.trunc.v32f64(<32 x double> %op)
store <32 x double> %res, <32 x double>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
index 3539fcbf28b7..6d2d4227bfd3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -1,36 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
; Don't use SVE for 64-bit vectors.
-define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v4f16:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w0, #0x1
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: dup v2.4h, w8
-; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
-; NO_SVE-NEXT: ret
-;
+define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -43,15 +19,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v8f16:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w0, #0x1
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: dup v2.8h, w8
-; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
-; NO_SVE-NEXT: ret
-;
+define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -63,21 +31,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
ret <8 x half> %sel
}
-define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v16f16:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0, #16]
-; NO_SVE-NEXT: ldr q2, [x1]
-; NO_SVE-NEXT: ldr q3, [x1, #16]
-; NO_SVE-NEXT: dup v4.8h, w8
-; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
-; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
-; NO_SVE-NEXT: stp q0, q1, [x0]
-; NO_SVE-NEXT: ret
-;
+define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -99,26 +53,24 @@ define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
}
define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v32f16:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #48]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0]
-; NO_SVE-NEXT: ldr q2, [x0, #16]
-; NO_SVE-NEXT: ldr q3, [x0, #32]
-; NO_SVE-NEXT: ldr q4, [x1, #48]
-; NO_SVE-NEXT: dup v6.8h, w8
-; NO_SVE-NEXT: ldr q5, [x1]
-; NO_SVE-NEXT: ldr q7, [x1, #16]
-; NO_SVE-NEXT: ldr q16, [x1, #32]
-; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
-; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
-; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
-; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
-; NO_SVE-NEXT: stp q1, q2, [x0]
-; NO_SVE-NEXT: stp q3, q0, [x0, #32]
-; NO_SVE-NEXT: ret
+; VBITS_GE_256-LABEL: select_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.h
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.h, w9
+; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h
+; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512: // %bb.0:
@@ -140,58 +92,20 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v64f16:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #16]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0]
-; NO_SVE-NEXT: ldr q2, [x0, #48]
-; NO_SVE-NEXT: ldr q3, [x0, #32]
-; NO_SVE-NEXT: ldr q4, [x0, #80]
-; NO_SVE-NEXT: dup v21.8h, w8
-; NO_SVE-NEXT: ldr q5, [x0, #64]
-; NO_SVE-NEXT: ldr q6, [x0, #112]
-; NO_SVE-NEXT: ldr q7, [x0, #96]
-; NO_SVE-NEXT: ldr q16, [x1, #16]
-; NO_SVE-NEXT: ldr q17, [x1]
-; NO_SVE-NEXT: ldr q18, [x1, #48]
-; NO_SVE-NEXT: ldr q19, [x1, #32]
-; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
-; NO_SVE-NEXT: ldr q20, [x1, #80]
-; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
-; NO_SVE-NEXT: ldr q16, [x1, #64]
-; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
-; NO_SVE-NEXT: ldr q17, [x1, #112]
-; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
-; NO_SVE-NEXT: ldr q18, [x1, #96]
-; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
-; NO_SVE-NEXT: stp q1, q0, [x0]
-; NO_SVE-NEXT: mov v0.16b, v21.16b
-; NO_SVE-NEXT: mov v1.16b, v21.16b
-; NO_SVE-NEXT: stp q3, q2, [x0, #32]
-; NO_SVE-NEXT: mov v2.16b, v21.16b
-; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
-; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
-; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
-; NO_SVE-NEXT: stp q0, q4, [x0, #64]
-; NO_SVE-NEXT: stp q2, q1, [x0, #96]
-; NO_SVE-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: select_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: and w8, w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue p1.h
-; VBITS_GE_1024-NEXT: mov z2.h, w8
-; VBITS_GE_1024-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.h, p1/z, z2.h, #0
-; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <64 x half>, <64 x half>* %a
%op2 = load volatile <64 x half>, <64 x half>* %b
%sel = select i1 %mask, <64 x half> %op1, <64 x half> %op2
@@ -199,103 +113,20 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v128f16:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; NO_SVE-NEXT: .cfi_def_cfa_offset 32
-; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; NO_SVE-NEXT: .cfi_offset b8, -8
-; NO_SVE-NEXT: .cfi_offset b9, -16
-; NO_SVE-NEXT: .cfi_offset b10, -24
-; NO_SVE-NEXT: .cfi_offset b11, -32
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #240]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0, #224]
-; NO_SVE-NEXT: ldr q2, [x0, #208]
-; NO_SVE-NEXT: ldr q3, [x0, #192]
-; NO_SVE-NEXT: ldr q4, [x0, #176]
-; NO_SVE-NEXT: dup v8.8h, w8
-; NO_SVE-NEXT: ldr q5, [x0, #160]
-; NO_SVE-NEXT: ldr q6, [x0, #144]
-; NO_SVE-NEXT: ldr q7, [x0, #128]
-; NO_SVE-NEXT: ldr q16, [x0, #112]
-; NO_SVE-NEXT: ldr q17, [x0, #96]
-; NO_SVE-NEXT: ldr q18, [x0, #80]
-; NO_SVE-NEXT: ldr q19, [x0, #64]
-; NO_SVE-NEXT: ldr q20, [x0, #48]
-; NO_SVE-NEXT: ldr q21, [x0, #32]
-; NO_SVE-NEXT: ldr q22, [x0, #16]
-; NO_SVE-NEXT: ldr q23, [x0]
-; NO_SVE-NEXT: ldr q24, [x1, #240]
-; NO_SVE-NEXT: ldr q25, [x1, #224]
-; NO_SVE-NEXT: ldr q26, [x1, #208]
-; NO_SVE-NEXT: ldr q27, [x1, #192]
-; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
-; NO_SVE-NEXT: ldr q28, [x1, #176]
-; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
-; NO_SVE-NEXT: ldr q29, [x1, #160]
-; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
-; NO_SVE-NEXT: ldr q30, [x1, #144]
-; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
-; NO_SVE-NEXT: ldr q31, [x1, #128]
-; NO_SVE-NEXT: ldr q9, [x1, #112]
-; NO_SVE-NEXT: ldr q10, [x1, #96]
-; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
-; NO_SVE-NEXT: ldr q28, [x1, #80]
-; NO_SVE-NEXT: ldr q24, [x1, #64]
-; NO_SVE-NEXT: ldr q25, [x1, #48]
-; NO_SVE-NEXT: ldr q26, [x1, #32]
-; NO_SVE-NEXT: ldr q27, [x1, #16]
-; NO_SVE-NEXT: ldr q11, [x1]
-; NO_SVE-NEXT: stp q3, q2, [x0, #192]
-; NO_SVE-NEXT: stp q1, q0, [x0, #224]
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: mov v1.16b, v8.16b
-; NO_SVE-NEXT: mov v2.16b, v8.16b
-; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
-; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
-; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
-; NO_SVE-NEXT: mov v3.16b, v8.16b
-; NO_SVE-NEXT: stp q0, q4, [x0, #160]
-; NO_SVE-NEXT: mov v4.16b, v8.16b
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: stp q2, q1, [x0, #128]
-; NO_SVE-NEXT: mov v1.16b, v8.16b
-; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
-; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
-; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
-; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
-; NO_SVE-NEXT: mov v2.16b, v8.16b
-; NO_SVE-NEXT: stp q4, q3, [x0, #96]
-; NO_SVE-NEXT: mov v3.16b, v8.16b
-; NO_SVE-NEXT: mov v4.16b, v8.16b
-; NO_SVE-NEXT: stp q1, q0, [x0, #64]
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
-; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
-; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
-; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
-; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; NO_SVE-NEXT: stp q3, q2, [x0, #32]
-; NO_SVE-NEXT: stp q0, q4, [x0]
-; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
-; NO_SVE-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: select_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: and w8, w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue p1.h
-; VBITS_GE_2048-NEXT: mov z2.h, w8
-; VBITS_GE_2048-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.h, p1/z, z2.h, #0
-; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <128 x half>, <128 x half>* %a
%op2 = load volatile <128 x half>, <128 x half>* %b
%sel = select i1 %mask, <128 x half> %op1, <128 x half> %op2
@@ -304,15 +135,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v2f32:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w0, #0x1
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: dup v2.2s, w8
-; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
-; NO_SVE-NEXT: ret
-;
+define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -325,15 +148,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v4f32:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w0, #0x1
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: dup v2.4s, w8
-; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
-; NO_SVE-NEXT: ret
-;
+define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -345,21 +160,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #
ret <4 x float> %sel
}
-define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v8f32:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0, #16]
-; NO_SVE-NEXT: ldr q2, [x1]
-; NO_SVE-NEXT: ldr q3, [x1, #16]
-; NO_SVE-NEXT: dup v4.4s, w8
-; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
-; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
-; NO_SVE-NEXT: stp q0, q1, [x0]
-; NO_SVE-NEXT: ret
-;
+define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -381,26 +182,24 @@ define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
}
define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v16f32:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #48]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0]
-; NO_SVE-NEXT: ldr q2, [x0, #16]
-; NO_SVE-NEXT: ldr q3, [x0, #32]
-; NO_SVE-NEXT: ldr q4, [x1, #48]
-; NO_SVE-NEXT: dup v6.4s, w8
-; NO_SVE-NEXT: ldr q5, [x1]
-; NO_SVE-NEXT: ldr q7, [x1, #16]
-; NO_SVE-NEXT: ldr q16, [x1, #32]
-; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
-; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
-; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
-; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
-; NO_SVE-NEXT: stp q1, q2, [x0]
-; NO_SVE-NEXT: stp q3, q0, [x0, #32]
-; NO_SVE-NEXT: ret
+; VBITS_GE_256-LABEL: select_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ptrue p1.s
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.s, w9
+; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z4.s, #0
+; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s
+; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512: // %bb.0:
@@ -422,58 +221,20 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v32f32:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #16]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0]
-; NO_SVE-NEXT: ldr q2, [x0, #48]
-; NO_SVE-NEXT: ldr q3, [x0, #32]
-; NO_SVE-NEXT: ldr q4, [x0, #80]
-; NO_SVE-NEXT: dup v21.4s, w8
-; NO_SVE-NEXT: ldr q5, [x0, #64]
-; NO_SVE-NEXT: ldr q6, [x0, #112]
-; NO_SVE-NEXT: ldr q7, [x0, #96]
-; NO_SVE-NEXT: ldr q16, [x1, #16]
-; NO_SVE-NEXT: ldr q17, [x1]
-; NO_SVE-NEXT: ldr q18, [x1, #48]
-; NO_SVE-NEXT: ldr q19, [x1, #32]
-; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
-; NO_SVE-NEXT: ldr q20, [x1, #80]
-; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
-; NO_SVE-NEXT: ldr q16, [x1, #64]
-; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
-; NO_SVE-NEXT: ldr q17, [x1, #112]
-; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
-; NO_SVE-NEXT: ldr q18, [x1, #96]
-; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
-; NO_SVE-NEXT: stp q1, q0, [x0]
-; NO_SVE-NEXT: mov v0.16b, v21.16b
-; NO_SVE-NEXT: mov v1.16b, v21.16b
-; NO_SVE-NEXT: stp q3, q2, [x0, #32]
-; NO_SVE-NEXT: mov v2.16b, v21.16b
-; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
-; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
-; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
-; NO_SVE-NEXT: stp q0, q4, [x0, #64]
-; NO_SVE-NEXT: stp q2, q1, [x0, #96]
-; NO_SVE-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: select_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: and w8, w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue p1.s
-; VBITS_GE_1024-NEXT: mov z2.s, w8
-; VBITS_GE_1024-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.s, p1/z, z2.s, #0
-; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: and z2.s, z2.s, #0x1
+; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <32 x float>, <32 x float>* %a
%op2 = load volatile <32 x float>, <32 x float>* %b
%sel = select i1 %mask, <32 x float> %op1, <32 x float> %op2
@@ -481,103 +242,20 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v64f32:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; NO_SVE-NEXT: .cfi_def_cfa_offset 32
-; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; NO_SVE-NEXT: .cfi_offset b8, -8
-; NO_SVE-NEXT: .cfi_offset b9, -16
-; NO_SVE-NEXT: .cfi_offset b10, -24
-; NO_SVE-NEXT: .cfi_offset b11, -32
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #240]
-; NO_SVE-NEXT: csetm w8, ne
-; NO_SVE-NEXT: ldr q1, [x0, #224]
-; NO_SVE-NEXT: ldr q2, [x0, #208]
-; NO_SVE-NEXT: ldr q3, [x0, #192]
-; NO_SVE-NEXT: ldr q4, [x0, #176]
-; NO_SVE-NEXT: dup v8.4s, w8
-; NO_SVE-NEXT: ldr q5, [x0, #160]
-; NO_SVE-NEXT: ldr q6, [x0, #144]
-; NO_SVE-NEXT: ldr q7, [x0, #128]
-; NO_SVE-NEXT: ldr q16, [x0, #112]
-; NO_SVE-NEXT: ldr q17, [x0, #96]
-; NO_SVE-NEXT: ldr q18, [x0, #80]
-; NO_SVE-NEXT: ldr q19, [x0, #64]
-; NO_SVE-NEXT: ldr q20, [x0, #48]
-; NO_SVE-NEXT: ldr q21, [x0, #32]
-; NO_SVE-NEXT: ldr q22, [x0, #16]
-; NO_SVE-NEXT: ldr q23, [x0]
-; NO_SVE-NEXT: ldr q24, [x1, #240]
-; NO_SVE-NEXT: ldr q25, [x1, #224]
-; NO_SVE-NEXT: ldr q26, [x1, #208]
-; NO_SVE-NEXT: ldr q27, [x1, #192]
-; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
-; NO_SVE-NEXT: ldr q28, [x1, #176]
-; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
-; NO_SVE-NEXT: ldr q29, [x1, #160]
-; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
-; NO_SVE-NEXT: ldr q30, [x1, #144]
-; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
-; NO_SVE-NEXT: ldr q31, [x1, #128]
-; NO_SVE-NEXT: ldr q9, [x1, #112]
-; NO_SVE-NEXT: ldr q10, [x1, #96]
-; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
-; NO_SVE-NEXT: ldr q28, [x1, #80]
-; NO_SVE-NEXT: ldr q24, [x1, #64]
-; NO_SVE-NEXT: ldr q25, [x1, #48]
-; NO_SVE-NEXT: ldr q26, [x1, #32]
-; NO_SVE-NEXT: ldr q27, [x1, #16]
-; NO_SVE-NEXT: ldr q11, [x1]
-; NO_SVE-NEXT: stp q3, q2, [x0, #192]
-; NO_SVE-NEXT: stp q1, q0, [x0, #224]
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: mov v1.16b, v8.16b
-; NO_SVE-NEXT: mov v2.16b, v8.16b
-; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
-; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
-; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
-; NO_SVE-NEXT: mov v3.16b, v8.16b
-; NO_SVE-NEXT: stp q0, q4, [x0, #160]
-; NO_SVE-NEXT: mov v4.16b, v8.16b
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: stp q2, q1, [x0, #128]
-; NO_SVE-NEXT: mov v1.16b, v8.16b
-; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
-; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
-; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
-; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
-; NO_SVE-NEXT: mov v2.16b, v8.16b
-; NO_SVE-NEXT: stp q4, q3, [x0, #96]
-; NO_SVE-NEXT: mov v3.16b, v8.16b
-; NO_SVE-NEXT: mov v4.16b, v8.16b
-; NO_SVE-NEXT: stp q1, q0, [x0, #64]
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
-; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
-; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
-; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
-; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; NO_SVE-NEXT: stp q3, q2, [x0, #32]
-; NO_SVE-NEXT: stp q0, q4, [x0]
-; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
-; NO_SVE-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: select_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: and w8, w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue p1.s
-; VBITS_GE_2048-NEXT: mov z2.s, w8
-; VBITS_GE_2048-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.s, p1/z, z2.s, #0
-; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: and z2.s, z2.s, #0x1
+; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <64 x float>, <64 x float>* %a
%op2 = load volatile <64 x float>, <64 x float>* %b
%sel = select i1 %mask, <64 x float> %op1, <64 x float> %op2
@@ -586,15 +264,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v1f64:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w0, #0x1
-; NO_SVE-NEXT: csetm x8, ne
-; NO_SVE-NEXT: fmov d2, x8
-; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
-; NO_SVE-NEXT: ret
-;
+define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -607,15 +277,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v2f64:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w0, #0x1
-; NO_SVE-NEXT: csetm x8, ne
-; NO_SVE-NEXT: dup v2.2d, x8
-; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
-; NO_SVE-NEXT: ret
-;
+define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -627,21 +289,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
ret <2 x double> %sel
}
-define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v4f64:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0]
-; NO_SVE-NEXT: csetm x8, ne
-; NO_SVE-NEXT: ldr q1, [x0, #16]
-; NO_SVE-NEXT: ldr q2, [x1]
-; NO_SVE-NEXT: ldr q3, [x1, #16]
-; NO_SVE-NEXT: dup v4.2d, x8
-; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
-; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
-; NO_SVE-NEXT: stp q0, q1, [x0]
-; NO_SVE-NEXT: ret
-;
+define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -663,26 +311,24 @@ define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
}
define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v8f64:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #48]
-; NO_SVE-NEXT: csetm x8, ne
-; NO_SVE-NEXT: ldr q1, [x0]
-; NO_SVE-NEXT: ldr q2, [x0, #16]
-; NO_SVE-NEXT: ldr q3, [x0, #32]
-; NO_SVE-NEXT: ldr q4, [x1, #48]
-; NO_SVE-NEXT: dup v6.2d, x8
-; NO_SVE-NEXT: ldr q5, [x1]
-; NO_SVE-NEXT: ldr q7, [x1, #16]
-; NO_SVE-NEXT: ldr q16, [x1, #32]
-; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
-; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
-; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
-; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
-; NO_SVE-NEXT: stp q1, q2, [x0]
-; NO_SVE-NEXT: stp q3, q0, [x0, #32]
-; NO_SVE-NEXT: ret
+; VBITS_GE_256-LABEL: select_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ptrue p1.d
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.d, x9
+; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z4.d, #0
+; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512: // %bb.0:
@@ -704,58 +350,20 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v16f64:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #16]
-; NO_SVE-NEXT: csetm x8, ne
-; NO_SVE-NEXT: ldr q1, [x0]
-; NO_SVE-NEXT: ldr q2, [x0, #48]
-; NO_SVE-NEXT: ldr q3, [x0, #32]
-; NO_SVE-NEXT: ldr q4, [x0, #80]
-; NO_SVE-NEXT: dup v21.2d, x8
-; NO_SVE-NEXT: ldr q5, [x0, #64]
-; NO_SVE-NEXT: ldr q6, [x0, #112]
-; NO_SVE-NEXT: ldr q7, [x0, #96]
-; NO_SVE-NEXT: ldr q16, [x1, #16]
-; NO_SVE-NEXT: ldr q17, [x1]
-; NO_SVE-NEXT: ldr q18, [x1, #48]
-; NO_SVE-NEXT: ldr q19, [x1, #32]
-; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
-; NO_SVE-NEXT: ldr q20, [x1, #80]
-; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
-; NO_SVE-NEXT: ldr q16, [x1, #64]
-; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
-; NO_SVE-NEXT: ldr q17, [x1, #112]
-; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
-; NO_SVE-NEXT: ldr q18, [x1, #96]
-; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
-; NO_SVE-NEXT: stp q1, q0, [x0]
-; NO_SVE-NEXT: mov v0.16b, v21.16b
-; NO_SVE-NEXT: mov v1.16b, v21.16b
-; NO_SVE-NEXT: stp q3, q2, [x0, #32]
-; NO_SVE-NEXT: mov v2.16b, v21.16b
-; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
-; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
-; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
-; NO_SVE-NEXT: stp q0, q4, [x0, #64]
-; NO_SVE-NEXT: stp q2, q1, [x0, #96]
-; NO_SVE-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: select_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: and w8, w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue p1.d
-; VBITS_GE_1024-NEXT: mov z2.d, x8
-; VBITS_GE_1024-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: and z2.d, z2.d, #0x1
+; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <16 x double>, <16 x double>* %a
%op2 = load volatile <16 x double>, <16 x double>* %b
%sel = select i1 %mask, <16 x double> %op1, <16 x double> %op2
@@ -763,103 +371,20 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v32f64:
-; NO_SVE: // %bb.0:
-; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; NO_SVE-NEXT: .cfi_def_cfa_offset 32
-; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; NO_SVE-NEXT: .cfi_offset b8, -8
-; NO_SVE-NEXT: .cfi_offset b9, -16
-; NO_SVE-NEXT: .cfi_offset b10, -24
-; NO_SVE-NEXT: .cfi_offset b11, -32
-; NO_SVE-NEXT: tst w2, #0x1
-; NO_SVE-NEXT: ldr q0, [x0, #240]
-; NO_SVE-NEXT: csetm x8, ne
-; NO_SVE-NEXT: ldr q1, [x0, #224]
-; NO_SVE-NEXT: ldr q2, [x0, #208]
-; NO_SVE-NEXT: ldr q3, [x0, #192]
-; NO_SVE-NEXT: ldr q4, [x0, #176]
-; NO_SVE-NEXT: dup v8.2d, x8
-; NO_SVE-NEXT: ldr q5, [x0, #160]
-; NO_SVE-NEXT: ldr q6, [x0, #144]
-; NO_SVE-NEXT: ldr q7, [x0, #128]
-; NO_SVE-NEXT: ldr q16, [x0, #112]
-; NO_SVE-NEXT: ldr q17, [x0, #96]
-; NO_SVE-NEXT: ldr q18, [x0, #80]
-; NO_SVE-NEXT: ldr q19, [x0, #64]
-; NO_SVE-NEXT: ldr q20, [x0, #48]
-; NO_SVE-NEXT: ldr q21, [x0, #32]
-; NO_SVE-NEXT: ldr q22, [x0, #16]
-; NO_SVE-NEXT: ldr q23, [x0]
-; NO_SVE-NEXT: ldr q24, [x1, #240]
-; NO_SVE-NEXT: ldr q25, [x1, #224]
-; NO_SVE-NEXT: ldr q26, [x1, #208]
-; NO_SVE-NEXT: ldr q27, [x1, #192]
-; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
-; NO_SVE-NEXT: ldr q28, [x1, #176]
-; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
-; NO_SVE-NEXT: ldr q29, [x1, #160]
-; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
-; NO_SVE-NEXT: ldr q30, [x1, #144]
-; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
-; NO_SVE-NEXT: ldr q31, [x1, #128]
-; NO_SVE-NEXT: ldr q9, [x1, #112]
-; NO_SVE-NEXT: ldr q10, [x1, #96]
-; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
-; NO_SVE-NEXT: ldr q28, [x1, #80]
-; NO_SVE-NEXT: ldr q24, [x1, #64]
-; NO_SVE-NEXT: ldr q25, [x1, #48]
-; NO_SVE-NEXT: ldr q26, [x1, #32]
-; NO_SVE-NEXT: ldr q27, [x1, #16]
-; NO_SVE-NEXT: ldr q11, [x1]
-; NO_SVE-NEXT: stp q3, q2, [x0, #192]
-; NO_SVE-NEXT: stp q1, q0, [x0, #224]
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: mov v1.16b, v8.16b
-; NO_SVE-NEXT: mov v2.16b, v8.16b
-; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
-; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
-; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
-; NO_SVE-NEXT: mov v3.16b, v8.16b
-; NO_SVE-NEXT: stp q0, q4, [x0, #160]
-; NO_SVE-NEXT: mov v4.16b, v8.16b
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: stp q2, q1, [x0, #128]
-; NO_SVE-NEXT: mov v1.16b, v8.16b
-; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
-; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
-; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
-; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
-; NO_SVE-NEXT: mov v2.16b, v8.16b
-; NO_SVE-NEXT: stp q4, q3, [x0, #96]
-; NO_SVE-NEXT: mov v3.16b, v8.16b
-; NO_SVE-NEXT: mov v4.16b, v8.16b
-; NO_SVE-NEXT: stp q1, q0, [x0, #64]
-; NO_SVE-NEXT: mov v0.16b, v8.16b
-; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
-; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
-; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
-; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
-; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; NO_SVE-NEXT: stp q3, q2, [x0, #32]
-; NO_SVE-NEXT: stp q0, q4, [x0]
-; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
-; NO_SVE-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: select_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: and w8, w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue p1.d
-; VBITS_GE_2048-NEXT: mov z2.d, x8
-; VBITS_GE_2048-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: and z2.d, z2.d, #0x1
+; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <32 x double>, <32 x double>* %a
%op2 = load volatile <32 x double>, <32 x double>* %b
%sel = select i1 %mask, <32 x double> %op1, <32 x double> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
index f4806abd06e4..dd6b1e41fb4b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -1,58 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; FCVTZU H -> H
;
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) #0 {
+define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f16_v4i16:
-; CHECK: fcvtzu v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = fptoui <4 x half> %op1 to <4 x i16>
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define void @fcvtzu_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) #0 {
+define void @fcvtzu_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v8f16_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: fcvtzu v0.8h, v0.8h
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: fcvtzu v0.8h, v0.8h
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fptoui <8 x half> %op1 to <8 x i16>
store <8 x i16> %res, <8 x i16>* %b
ret void
}
-define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v16f16_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fptoui <16 x half> %op1 to <16 x i16>
store <16 x i16> %res, <16 x i16>* %b
@@ -60,49 +48,53 @@ define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
}
define void @fcvtzu_v32f16_v32i16(<32 x half>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v32f16_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v32f16_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvtzu z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: fcvtzu z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v32f16_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fcvtzu z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fptoui <32 x half> %op1 to <32 x i16>
store <32 x i16> %res, <32 x i16>* %b
ret void
}
-define void @fcvtzu_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzu_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v64f16_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fptoui <64 x half> %op1 to <64 x i16>
store <64 x i16> %res, <64 x i16>* %b
ret void
}
-define void @fcvtzu_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
+define void @fcvtzu_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v128f16_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%res = fptoui <128 x half> %op1 to <128 x i16>
store <128 x i16> %res, <128 x i16>* %b
@@ -114,32 +106,37 @@ define void @fcvtzu_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) #0 {
+define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f16_v2i32:
-; CHECK: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%res = fptoui <2 x half> %op1 to <2 x i32>
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) #0 {
+define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f16_v4i32:
-; CHECK: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = fptoui <4 x half> %op1 to <4 x i32>
ret <4 x i32> %res
}
-define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v8f16_v8i32:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[UPK]].h
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fptoui <8 x half> %op1 to <8 x i32>
store <8 x i32> %res, <8 x i32>* %b
@@ -147,57 +144,62 @@ define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
}
define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v16f16_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, [[VEC]].h
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, [[VEC_HI]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x1, x[[NUMELTS]], lsl #2]
+; VBITS_GE_256-LABEL: fcvtzu_v16f16_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fptoui <16 x half> %op1 to <16 x i32>
store <16 x i32> %res, <16 x i32>* %b
ret void
}
-define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v32f16_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fptoui <32 x half> %op1 to <32 x i32>
store <32 x i32> %res, <32 x i32>* %b
ret void
}
-define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v64f16_v64i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fptoui <64 x half> %op1 to <64 x i32>
store <64 x i32> %res, <64 x i32>* %b
@@ -209,36 +211,41 @@ define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) #0 {
+define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v1f16_v1i64:
-; CHECK: fcvtzu x8, h0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu x8, h0
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
%res = fptoui <1 x half> %op1 to <1 x i64>
ret <1 x i64> %res
}
; v2f16 is not legal for NEON, so use SVE
-define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) #0 {
+define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f16_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzu z0.d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = fptoui <2 x half> %op1 to <2 x i64>
ret <2 x i64> %res
}
-define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f16_v4i64:
-; CHECK: ldr d[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x half>, <4 x half>* %a
%res = fptoui <4 x half> %op1 to <4 x i64>
store <4 x i64> %res, <4 x i64>* %b
@@ -246,61 +253,65 @@ define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
}
define void @fcvtzu_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f16_v8i64:
-; VBITS_GE_512: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f16_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x0]
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.h
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f16_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fptoui <8 x half> %op1 to <8 x i64>
store <8 x i64> %res, <8 x i64>* %b
ret void
}
-define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v16f16_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fptoui <16 x half> %op1 to <16 x i64>
store <16 x i64> %res, <16 x i64>* %b
ret void
}
-define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v32f16_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK]].s
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fptoui <32 x half> %op1 to <32 x i64>
store <32 x i64> %res, <32 x i64>* %b
@@ -312,101 +323,110 @@ define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) #0 {
+define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f32_v2i16:
-; CHECK: fcvtzs v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = fptoui <2 x float> %op1 to <2 x i16>
ret <2 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) #0 {
+define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f32_v4i16:
-; CHECK: fcvtzu v1.4s, v0.4s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu v1.4s, v0.4s
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%res = fptoui <4 x float> %op1 to <4 x i16>
ret <4 x i16> %res
}
-define <8 x i16> @fcvtzu_v8f32_v8i16(<8 x float>* %a) #0 {
+define <8 x i16> @fcvtzu_v8f32_v8i16(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v8f32_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; CHECK-NEXT: uzp1 z0.h, [[CVT]].h, [[CVT]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%res = fptoui <8 x float> %op1 to <8 x i16>
ret <8 x i16> %res
}
define void @fcvtzu_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v16f32_v16i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s
+; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s
+; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptoui <16 x float> %op1 to <16 x i16>
store <16 x i16> %res, <16 x i16>* %b
ret void
}
-define void @fcvtzu_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzu_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v32f32_v32i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptoui <32 x float> %op1 to <32 x i16>
store <32 x i16> %res, <32 x i16>* %b
ret void
}
-define void @fcvtzu_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzu_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v64f32_v64i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptoui <64 x float> %op1 to <64 x i16>
store <64 x i16> %res, <64 x i16>* %b
@@ -418,30 +438,33 @@ define void @fcvtzu_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) #0 {
+define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f32_v2i32:
-; CHECK: fcvtzu v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = fptoui <2 x float> %op1 to <2 x i32>
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) #0 {
+define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f32_v4i32:
-; CHECK: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = fptoui <4 x float> %op1 to <4 x i32>
ret <4 x i32> %res
}
-define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v8f32_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%res = fptoui <8 x float> %op1 to <8 x i32>
store <8 x i32> %res, <8 x i32>* %b
@@ -449,49 +472,53 @@ define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
}
define void @fcvtzu_v16f32_v16i32(<16 x float>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v16f32_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptoui <16 x float> %op1 to <16 x i32>
store <16 x i32> %res, <16 x i32>* %b
ret void
}
-define void @fcvtzu_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzu_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v32f32_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptoui <32 x float> %op1 to <32 x i32>
store <32 x i32> %res, <32 x i32>* %b
ret void
}
-define void @fcvtzu_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzu_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v64f32_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptoui <64 x float> %op1 to <64 x i32>
store <64 x i32> %res, <64 x i32>* %b
@@ -503,33 +530,37 @@ define void @fcvtzu_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) #0 {
+define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v1f32_v1i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%res = fptoui <1 x float> %op1 to <1 x i64>
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) #0 {
+define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f32_v2i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = fptoui <2 x float> %op1 to <2 x i64>
ret <2 x i64> %res
}
-define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f32_v4i64:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[OP]].s
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK]].s
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x float>, <4 x float>* %a
%res = fptoui <4 x float> %op1 to <4 x i64>
store <4 x i64> %res, <4 x i64>* %b
@@ -537,57 +568,62 @@ define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
}
define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f32_v8i64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG1]]/m, [[UPK]].s
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, [[VEC]].s
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, [[VEC_HI]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
+; VBITS_GE_256-LABEL: fcvtzu_v8f32_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%res = fptoui <8 x float> %op1 to <8 x i64>
store <8 x i64> %res, <8 x i64>* %b
ret void
}
-define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v16f32_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptoui <16 x float> %op1 to <16 x i64>
store <16 x i64> %res, <16 x i64>* %b
ret void
}
-define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v32f32_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptoui <32 x float> %op1 to <32 x i64>
store <32 x i64> %res, <32 x i64>* %b
@@ -600,98 +636,110 @@ define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
;
; v1f64 is perfered to be widened to v4f64, so use SVE
-define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) #0 {
+define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v1f64_v1i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i16>
ret <1 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) #0 {
+define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f64_v2i16:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
%res = fptoui <2 x double> %op1 to <2 x i16>
ret <2 x i16> %res
}
-define <4 x i16> @fcvtzu_v4f64_v4i16(<4 x double>* %a) #0 {
+define <4 x i16> @fcvtzu_v4f64_v4i16(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f64_v4i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%res = fptoui <4 x double> %op1 to <4 x i16>
ret <4 x i16> %res
}
define <8 x i16> @fcvtzu_v8f64_v8i16(<8 x double>* %a) #0 {
-; CHECK-LABEL: fcvtzu_v8f64_v8i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d
+; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d
+; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptoui <8 x double> %op1 to <8 x i16>
ret <8 x i16> %res
}
-define void @fcvtzu_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzu_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v16f64_v16i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptoui <16 x double> %op1 to <16 x i16>
store <16 x i16> %res, <16 x i16>* %b
ret void
}
-define void @fcvtzu_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzu_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v32f64_v32i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptoui <32 x double> %op1 to <32 x i16>
store <32 x i16> %res, <32 x i16>* %b
@@ -703,96 +751,105 @@ define void @fcvtzu_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) #0 {
+define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v1f64_v1i32:
-; CHECK: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i32>
ret <1 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) #0 {
+define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f64_v2i32:
-; CHECK: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
%res = fptoui <2 x double> %op1 to <2 x i32>
ret <2 x i32> %res
}
-define <4 x i32> @fcvtzu_v4f64_v4i32(<4 x double>* %a) #0 {
+define <4 x i32> @fcvtzu_v4f64_v4i32(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f64_v4i32:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 z0.s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%res = fptoui <4 x double> %op1 to <4 x i32>
ret <4 x i32> %res
}
define void @fcvtzu_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f64_v8i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d
+; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d
+; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptoui <8 x double> %op1 to <8 x i32>
store <8 x i32> %res, <8 x i32>* %b
ret void
}
-define void @fcvtzu_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) #0 {
+define void @fcvtzu_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v16f64_v16i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptoui <16 x double> %op1 to <16 x i32>
store <16 x i32> %res, <16 x i32>* %b
ret void
}
-define void @fcvtzu_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzu_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v32f64_v32i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptoui <32 x double> %op1 to <32 x i32>
store <32 x i32> %res, <32 x i32>* %b
@@ -804,31 +861,34 @@ define void @fcvtzu_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) #0 {
+define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v1f64_v1i64:
-; CHECK: fcvtzu x8, d0
-; CHECK: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu x8, d0
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i64>
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) #0 {
+define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v2f64_v2i64:
-; CHECK: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = fptoui <2 x double> %op1 to <2 x i64>
ret <2 x i64> %res
}
-define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v4f64_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%res = fptoui <4 x double> %op1 to <4 x i64>
store <4 x i64> %res, <4 x i64>* %b
@@ -836,49 +896,53 @@ define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
}
define void @fcvtzu_v8f64_v8i64(<8 x double>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f64_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptoui <8 x double> %op1 to <8 x i64>
store <8 x i64> %res, <8 x i64>* %b
ret void
}
-define void @fcvtzu_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzu_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzu_v16f64_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptoui <16 x double> %op1 to <16 x i64>
store <16 x i64> %res, <16 x i64>* %b
ret void
}
-define void @fcvtzu_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzu_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzu_v32f64_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptoui <32 x double> %op1 to <32 x i64>
store <32 x i64> %res, <32 x i64>* %b
@@ -890,34 +954,37 @@ define void @fcvtzu_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) #0 {
+define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f16_v4i16:
-; CHECK: fcvtzs v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = fptosi <4 x half> %op1 to <4 x i16>
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define void @fcvtzs_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) #0 {
+define void @fcvtzs_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v8f16_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: fcvtzs v0.8h, v0.8h
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: fcvtzs v0.8h, v0.8h
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fptosi <8 x half> %op1 to <8 x i16>
store <8 x i16> %res, <8 x i16>* %b
ret void
}
-define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v16f16_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fptosi <16 x half> %op1 to <16 x i16>
store <16 x i16> %res, <16 x i16>* %b
@@ -925,49 +992,53 @@ define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
}
define void @fcvtzs_v32f16_v32i16(<32 x half>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v32f16_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v32f16_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvtzs z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: fcvtzs z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v32f16_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fcvtzs z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fptosi <32 x half> %op1 to <32 x i16>
store <32 x i16> %res, <32 x i16>* %b
ret void
}
-define void @fcvtzs_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzs_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v64f16_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fptosi <64 x half> %op1 to <64 x i16>
store <64 x i16> %res, <64 x i16>* %b
ret void
}
-define void @fcvtzs_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
+define void @fcvtzs_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v128f16_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%res = fptosi <128 x half> %op1 to <128 x i16>
store <128 x i16> %res, <128 x i16>* %b
@@ -979,32 +1050,37 @@ define void @fcvtzs_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) #0 {
+define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f16_v2i32:
-; CHECK: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%res = fptosi <2 x half> %op1 to <2 x i32>
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) #0 {
+define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f16_v4i32:
-; CHECK: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = fptosi <4 x half> %op1 to <4 x i32>
ret <4 x i32> %res
}
-define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v8f16_v8i32:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[UPK]].h
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fptosi <8 x half> %op1 to <8 x i32>
store <8 x i32> %res, <8 x i32>* %b
@@ -1012,57 +1088,62 @@ define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
}
define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v16f16_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, [[VEC]].h
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, [[VEC_HI]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x1, x[[NUMELTS]], lsl #2]
+; VBITS_GE_256-LABEL: fcvtzs_v16f16_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fptosi <16 x half> %op1 to <16 x i32>
store <16 x i32> %res, <16 x i32>* %b
ret void
}
-define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v32f16_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fptosi <32 x half> %op1 to <32 x i32>
store <32 x i32> %res, <32 x i32>* %b
ret void
}
-define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v64f16_v64i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fptosi <64 x half> %op1 to <64 x i32>
store <64 x i32> %res, <64 x i32>* %b
@@ -1074,36 +1155,41 @@ define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) #0 {
+define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v1f16_v1i64:
-; CHECK: fcvtzs x8, h0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs x8, h0
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
%res = fptosi <1 x half> %op1 to <1 x i64>
ret <1 x i64> %res
}
; v2f16 is not legal for NEON, so use SVE
-define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) #0 {
+define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f16_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzs z0.d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = fptosi <2 x half> %op1 to <2 x i64>
ret <2 x i64> %res
}
-define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f16_v4i64:
-; CHECK: ldr d[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x half>, <4 x half>* %a
%res = fptosi <4 x half> %op1 to <4 x i64>
store <4 x i64> %res, <4 x i64>* %b
@@ -1111,61 +1197,65 @@ define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
}
define void @fcvtzs_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f16_v8i64:
-; VBITS_GE_512: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f16_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x0]
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.h
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f16_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fptosi <8 x half> %op1 to <8 x i64>
store <8 x i64> %res, <8 x i64>* %b
ret void
}
-define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v16f16_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fptosi <16 x half> %op1 to <16 x i64>
store <16 x i64> %res, <16 x i64>* %b
ret void
}
-define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v32f16_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK]].s
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fptosi <32 x half> %op1 to <32 x i64>
store <32 x i64> %res, <32 x i64>* %b
@@ -1177,101 +1267,110 @@ define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) #0 {
+define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f32_v2i16:
-; CHECK: fcvtzs v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = fptosi <2 x float> %op1 to <2 x i16>
ret <2 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) #0 {
+define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f32_v4i16:
-; CHECK: fcvtzs v1.4s, v0.4s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v1.4s, v0.4s
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%res = fptosi <4 x float> %op1 to <4 x i16>
ret <4 x i16> %res
}
-define <8 x i16> @fcvtzs_v8f32_v8i16(<8 x float>* %a) #0 {
+define <8 x i16> @fcvtzs_v8f32_v8i16(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v8f32_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; CHECK-NEXT: uzp1 z0.h, [[CVT]].h, [[CVT]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%res = fptosi <8 x float> %op1 to <8 x i16>
ret <8 x i16> %res
}
define void @fcvtzs_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v16f32_v16i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s
+; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s
+; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptosi <16 x float> %op1 to <16 x i16>
store <16 x i16> %res, <16 x i16>* %b
ret void
}
-define void @fcvtzs_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzs_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v32f32_v32i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptosi <32 x float> %op1 to <32 x i16>
store <32 x i16> %res, <32 x i16>* %b
ret void
}
-define void @fcvtzs_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzs_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v64f32_v64i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptosi <64 x float> %op1 to <64 x i16>
store <64 x i16> %res, <64 x i16>* %b
@@ -1283,30 +1382,33 @@ define void @fcvtzs_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) #0 {
+define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f32_v2i32:
-; CHECK: fcvtzs v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = fptosi <2 x float> %op1 to <2 x i32>
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) #0 {
+define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f32_v4i32:
-; CHECK: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = fptosi <4 x float> %op1 to <4 x i32>
ret <4 x i32> %res
}
-define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v8f32_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%res = fptosi <8 x float> %op1 to <8 x i32>
store <8 x i32> %res, <8 x i32>* %b
@@ -1314,49 +1416,53 @@ define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
}
define void @fcvtzs_v16f32_v16i32(<16 x float>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v16f32_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptosi <16 x float> %op1 to <16 x i32>
store <16 x i32> %res, <16 x i32>* %b
ret void
}
-define void @fcvtzs_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzs_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v32f32_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptosi <32 x float> %op1 to <32 x i32>
store <32 x i32> %res, <32 x i32>* %b
ret void
}
-define void @fcvtzs_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzs_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v64f32_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptosi <64 x float> %op1 to <64 x i32>
store <64 x i32> %res, <64 x i32>* %b
@@ -1368,33 +1474,37 @@ define void @fcvtzs_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) #0 {
+define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v1f32_v1i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%res = fptosi <1 x float> %op1 to <1 x i64>
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) #0 {
+define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f32_v2i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = fptosi <2 x float> %op1 to <2 x i64>
ret <2 x i64> %res
}
-define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f32_v4i64:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[OP]].s
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK]].s
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x float>, <4 x float>* %a
%res = fptosi <4 x float> %op1 to <4 x i64>
store <4 x i64> %res, <4 x i64>* %b
@@ -1402,57 +1512,62 @@ define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
}
define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f32_v8i64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG1]]/m, [[UPK]].s
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, [[VEC]].s
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, [[VEC]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
+; VBITS_GE_256-LABEL: fcvtzs_v8f32_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%res = fptosi <8 x float> %op1 to <8 x i64>
store <8 x i64> %res, <8 x i64>* %b
ret void
}
-define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v16f32_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptosi <16 x float> %op1 to <16 x i64>
store <16 x i64> %res, <16 x i64>* %b
ret void
}
-define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v32f32_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptosi <32 x float> %op1 to <32 x i64>
store <32 x i64> %res, <32 x i64>* %b
@@ -1465,98 +1580,110 @@ define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
;
; v1f64 is preferred to be widened to v4f64, so use SVE
-define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) #0 {
+define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v1f64_v1i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i16>
ret <1 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) #0 {
+define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f64_v2i16:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
%res = fptosi <2 x double> %op1 to <2 x i16>
ret <2 x i16> %res
}
-define <4 x i16> @fcvtzs_v4f64_v4i16(<4 x double>* %a) #0 {
+define <4 x i16> @fcvtzs_v4f64_v4i16(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f64_v4i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%res = fptosi <4 x double> %op1 to <4 x i16>
ret <4 x i16> %res
}
define <8 x i16> @fcvtzs_v8f64_v8i16(<8 x double>* %a) #0 {
-; CHECK-LABEL: fcvtzs_v8f64_v8i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d
+; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d
+; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptosi <8 x double> %op1 to <8 x i16>
ret <8 x i16> %res
}
-define void @fcvtzs_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzs_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v16f64_v16i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptosi <16 x double> %op1 to <16 x i16>
store <16 x i16> %res, <16 x i16>* %b
ret void
}
-define void @fcvtzs_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzs_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v32f64_v32i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptosi <32 x double> %op1 to <32 x i16>
store <32 x i16> %res, <32 x i16>* %b
@@ -1568,96 +1695,105 @@ define void @fcvtzs_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) #0 {
+define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v1f64_v1i32:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i32>
ret <1 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) #0 {
+define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f64_v2i32:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
%res = fptosi <2 x double> %op1 to <2 x i32>
ret <2 x i32> %res
}
-define <4 x i32> @fcvtzs_v4f64_v4i32(<4 x double>* %a) #0 {
+define <4 x i32> @fcvtzs_v4f64_v4i32(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f64_v4i32:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 z0.s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%res = fptosi <4 x double> %op1 to <4 x i32>
ret <4 x i32> %res
}
define void @fcvtzs_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f64_v8i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d
+; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d
+; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptosi <8 x double> %op1 to <8 x i32>
store <8 x i32> %res, <8 x i32>* %b
ret void
}
-define void @fcvtzs_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) #0 {
+define void @fcvtzs_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v16f64_v16i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptosi <16 x double> %op1 to <16 x i32>
store <16 x i32> %res, <16 x i32>* %b
ret void
}
-define void @fcvtzs_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzs_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v32f64_v32i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptosi <32 x double> %op1 to <32 x i32>
store <32 x i32> %res, <32 x i32>* %b
@@ -1669,31 +1805,34 @@ define void @fcvtzs_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) #0 {
+define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v1f64_v1i64:
-; CHECK: fcvtzs x8, d0
-; CHECK: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs x8, d0
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i64>
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) #0 {
+define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v2f64_v2i64:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = fptosi <2 x double> %op1 to <2 x i64>
ret <2 x i64> %res
}
-define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v4f64_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%res = fptosi <4 x double> %op1 to <4 x i64>
store <4 x i64> %res, <4 x i64>* %b
@@ -1701,49 +1840,53 @@ define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
}
define void @fcvtzs_v8f64_v8i64(<8 x double>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f64_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptosi <8 x double> %op1 to <8 x i64>
store <8 x i64> %res, <8 x i64>* %b
ret void
}
-define void @fcvtzs_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzs_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvtzs_v16f64_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptosi <16 x double> %op1 to <16 x i64>
store <16 x i64> %res, <16 x i64>* %b
ret void
}
-define void @fcvtzs_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzs_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvtzs_v32f64_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptosi <32 x double> %op1 to <32 x i64>
store <32 x i64> %res, <32 x i64>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 6d16273f98cd..73e6693bb2ff 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -1,26 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors.
-define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 {
+define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.4h, v2.4h, #15
@@ -32,7 +18,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 {
+define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
@@ -44,7 +30,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
ret <8 x half> %sel
}
-define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -96,44 +82,16 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h
-; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z0.h, z6.h
-; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z7.h
-; VBITS_GE_256-NEXT: sel z0.h, p3, z0.h, z6.h
-; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z4.h
-; VBITS_GE_256-NEXT: sel z2.h, p1, z2.h, z5.h
-; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z7.h
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: select_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%mask = fcmp oeq <64 x half> %op1, %op2
@@ -142,68 +100,16 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x11, #80
-; VBITS_GE_256-NEXT: mov x12, #64
-; VBITS_GE_256-NEXT: mov x13, #112
-; VBITS_GE_256-NEXT: mov x14, #96
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h
-; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z4.h, z19.h
-; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z18.h
-; VBITS_GE_256-NEXT: fcmeq p5.h, p0/z, z2.h, z21.h
-; VBITS_GE_256-NEXT: fcmeq p6.h, p0/z, z1.h, z20.h
-; VBITS_GE_256-NEXT: fcmeq p7.h, p0/z, z0.h, z22.h
-; VBITS_GE_256-NEXT: fcmeq p8.h, p0/z, z7.h, z23.h
-; VBITS_GE_256-NEXT: sel z0.h, p7, z0.h, z22.h
-; VBITS_GE_256-NEXT: sel z1.h, p6, z1.h, z20.h
-; VBITS_GE_256-NEXT: sel z2.h, p5, z2.h, z21.h
-; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z18.h
-; VBITS_GE_256-NEXT: sel z4.h, p3, z4.h, z19.h
-; VBITS_GE_256-NEXT: sel z5.h, p2, z5.h, z16.h
-; VBITS_GE_256-NEXT: sel z6.h, p1, z6.h, z17.h
-; VBITS_GE_256-NEXT: sel z7.h, p8, z7.h, z23.h
-; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: select_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%mask = fcmp oeq <128 x half> %op1, %op2
@@ -213,7 +119,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 {
+define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.2s, v2.2s, #31
@@ -225,7 +131,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 {
+define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
@@ -237,7 +143,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
ret <4 x float> %sel
}
-define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -289,44 +195,16 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT: sel z0.s, p3, z0.s, z6.s
-; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z4.s
-; VBITS_GE_256-NEXT: sel z2.s, p1, z2.s, z5.s
-; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z7.s
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: select_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%mask = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +213,16 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #56
-; VBITS_GE_256-NEXT: mov x14, #48
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s
-; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s
-; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s
-; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s
-; VBITS_GE_256-NEXT: fcmeq p8.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT: sel z0.s, p7, z0.s, z22.s
-; VBITS_GE_256-NEXT: sel z1.s, p6, z1.s, z20.s
-; VBITS_GE_256-NEXT: sel z2.s, p5, z2.s, z21.s
-; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z18.s
-; VBITS_GE_256-NEXT: sel z4.s, p3, z4.s, z19.s
-; VBITS_GE_256-NEXT: sel z5.s, p2, z5.s, z16.s
-; VBITS_GE_256-NEXT: sel z6.s, p1, z6.s, z17.s
-; VBITS_GE_256-NEXT: sel z7.s, p8, z7.s, z23.s
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: select_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%mask = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +232,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 {
+define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -419,7 +245,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 {
+define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.2d, v2.2s, #0
@@ -431,7 +257,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
ret <2 x double> %sel
}
-define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -483,44 +309,16 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d
-; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z0.d, z6.d
-; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z7.d
-; VBITS_GE_256-NEXT: sel z0.d, p3, z0.d, z6.d
-; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z4.d
-; VBITS_GE_256-NEXT: sel z2.d, p1, z2.d, z5.d
-; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z7.d
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: select_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%mask = fcmp oeq <16 x double> %op1, %op2
@@ -529,68 +327,16 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d
-; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z4.d, z19.d
-; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z18.d
-; VBITS_GE_256-NEXT: fcmeq p5.d, p0/z, z2.d, z21.d
-; VBITS_GE_256-NEXT: fcmeq p6.d, p0/z, z1.d, z20.d
-; VBITS_GE_256-NEXT: fcmeq p7.d, p0/z, z0.d, z22.d
-; VBITS_GE_256-NEXT: fcmeq p8.d, p0/z, z7.d, z23.d
-; VBITS_GE_256-NEXT: sel z0.d, p7, z0.d, z22.d
-; VBITS_GE_256-NEXT: sel z1.d, p6, z1.d, z20.d
-; VBITS_GE_256-NEXT: sel z2.d, p5, z2.d, z21.d
-; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z18.d
-; VBITS_GE_256-NEXT: sel z4.d, p3, z4.d, z19.d
-; VBITS_GE_256-NEXT: sel z5.d, p2, z5.d, z16.d
-; VBITS_GE_256-NEXT: sel z6.d, p1, z6.d, z17.d
-; VBITS_GE_256-NEXT: sel z7.d, p8, z7.d, z23.d
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: select_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%mask = fcmp oeq <32 x double> %op1, %op2
@@ -599,4 +345,4 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
ret void
}
-attributes #0 = { "target-features"="+sve" uwtable }
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
index 695697ee5930..456d9fe2fd40 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,49 +10,66 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v4f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov h1, #5.00000000
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0
-; VBITS_GE_256-NEXT: mov v0.h[3], v1.h[0]
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT: ret
+define <4 x half> @insertelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov h1, #5.00000000
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[3], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%r = insertelement <4 x half> %op1, half 5.0, i64 3
ret <4 x half> %r
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @insertelement_v8f16(<8 x half> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v8f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov h1, #5.00000000
-; VBITS_GE_256-NEXT: mov v0.h[7], v1.h[0]
-; VBITS_GE_256-NEXT: ret
+define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov h1, #5.00000000
+; CHECK-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-NEXT: ret
%r = insertelement <8 x half> %op1, half 5.0, i64 7
ret <8 x half> %r
}
-define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 {
-; VBITS_GE_256-LABEL: insertelement_v16f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w9, #15
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fmov h2, #5.00000000
-; VBITS_GE_256-NEXT: index z3.h, #0, #1
-; VBITS_GE_256-NEXT: ptrue p1.h
-; VBITS_GE_256-NEXT: mov z1.h, w9
-; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_256-NEXT: mov z0.h, p1/m, h2
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+define <16 x half> @insertelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #15
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fmov h2, #5.00000000
+; CHECK-NEXT: index z3.h, #0, #1
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z1.h, w9
+; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/m, h2
+; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%r = insertelement <16 x half> %op1, half 5.0, i64 15
ret <16 x half> %r
}
define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
+; VBITS_GE_256-LABEL: insertelement_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: mov w10, #15
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: fmov h3, #5.00000000
+; VBITS_GE_256-NEXT: index z4.h, #0, #1
+; VBITS_GE_256-NEXT: ptrue p1.h
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: mov z2.h, w10
+; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z4.h, z2.h
+; VBITS_GE_256-NEXT: mov z0.h, p1/m, h3
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: insertelement_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #31
@@ -85,88 +88,105 @@ define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
ret <32 x half> %r
}
-define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 {
-; VBITS_GE_1024-LABEL: insertelement_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: mov w9, #63
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fmov h2, #5.00000000
-; VBITS_GE_1024-NEXT: index z3.h, #0, #1
-; VBITS_GE_1024-NEXT: ptrue p1.h
-; VBITS_GE_1024-NEXT: mov z1.h, w9
-; VBITS_GE_1024-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_1024-NEXT: mov z0.h, p1/m, h2
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <64 x half> @insertelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: insertelement_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #63
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fmov h2, #5.00000000
+; CHECK-NEXT: index z3.h, #0, #1
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z1.h, w9
+; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/m, h2
+; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%r = insertelement <64 x half> %op1, half 5.0, i64 63
ret <64 x half> %r
}
-define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 {
-; VBITS_GE_2048-LABEL: insertelement_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: mov w9, #127
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fmov h2, #5.00000000
-; VBITS_GE_2048-NEXT: index z3.h, #0, #1
-; VBITS_GE_2048-NEXT: ptrue p1.h
-; VBITS_GE_2048-NEXT: mov z1.h, w9
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_2048-NEXT: mov z0.h, p1/m, h2
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <128 x half> @insertelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: insertelement_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #127
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: fmov h2, #5.00000000
+; CHECK-NEXT: index z3.h, #0, #1
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z1.h, w9
+; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/m, h2
+; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%r = insertelement <128 x half> %op1, half 5.0, i64 127
ret <128 x half> %r
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v2f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov s1, #5.00000000
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0
-; VBITS_GE_256-NEXT: mov v0.s[1], v1.s[0]
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT: ret
+define <2 x float> @insertelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s1, #5.00000000
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%r = insertelement <2 x float> %op1, float 5.0, i64 1
ret <2 x float> %r
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @insertelement_v4f32(<4 x float> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v4f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov s1, #5.00000000
-; VBITS_GE_256-NEXT: mov v0.s[3], v1.s[0]
-; VBITS_GE_256-NEXT: ret
+define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s1, #5.00000000
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: ret
%r = insertelement <4 x float> %op1, float 5.0, i64 3
ret <4 x float> %r
}
-define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 {
-; VBITS_GE_256-LABEL: insertelement_v8f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w9, #7
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fmov s2, #5.00000000
-; VBITS_GE_256-NEXT: index z3.s, #0, #1
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: mov z1.s, w9
-; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_256-NEXT: mov z0.s, p1/m, s2
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+define <8 x float> @insertelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #7
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fmov s2, #5.00000000
+; CHECK-NEXT: index z3.s, #0, #1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z1.s, w9
+; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/m, s2
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%r = insertelement <8 x float> %op1, float 5.0, i64 7
ret <8 x float> %r
}
define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
+; VBITS_GE_256-LABEL: insertelement_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: mov w10, #7
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: fmov s3, #5.00000000
+; VBITS_GE_256-NEXT: index z4.s, #0, #1
+; VBITS_GE_256-NEXT: ptrue p1.s
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: mov z2.s, w10
+; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z4.s, z2.s
+; VBITS_GE_256-NEXT: mov z0.s, p1/m, s3
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: insertelement_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #15
@@ -185,86 +205,103 @@ define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
ret <16 x float> %r
}
-define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 {
-; VBITS_GE_1024-LABEL: insertelement_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: mov w9, #31
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fmov s2, #5.00000000
-; VBITS_GE_1024-NEXT: index z3.s, #0, #1
-; VBITS_GE_1024-NEXT: ptrue p1.s
-; VBITS_GE_1024-NEXT: mov z1.s, w9
-; VBITS_GE_1024-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_1024-NEXT: mov z0.s, p1/m, s2
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <32 x float> @insertelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: insertelement_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #31
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fmov s2, #5.00000000
+; CHECK-NEXT: index z3.s, #0, #1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z1.s, w9
+; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/m, s2
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%r = insertelement <32 x float> %op1, float 5.0, i64 31
ret <32 x float> %r
}
-define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 {
-; VBITS_GE_2048-LABEL: insertelement_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: mov w9, #63
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fmov s2, #5.00000000
-; VBITS_GE_2048-NEXT: index z3.s, #0, #1
-; VBITS_GE_2048-NEXT: ptrue p1.s
-; VBITS_GE_2048-NEXT: mov z1.s, w9
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_2048-NEXT: mov z0.s, p1/m, s2
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <64 x float> @insertelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: insertelement_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #63
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: fmov s2, #5.00000000
+; CHECK-NEXT: index z3.s, #0, #1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z1.s, w9
+; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/m, s2
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%r = insertelement <64 x float> %op1, float 5.0, i64 63
ret <64 x float> %r
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v1f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4617315517961601024
-; VBITS_GE_256-NEXT: fmov d0, x8
-; VBITS_GE_256-NEXT: ret
+define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #4617315517961601024
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
%r = insertelement <1 x double> %op1, double 5.0, i64 0
ret <1 x double> %r
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @insertelement_v2f64(<2 x double> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v2f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov d1, #5.00000000
-; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: ret
+define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d1, #5.00000000
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
%r = insertelement <2 x double> %op1, double 5.0, i64 1
ret <2 x double> %r
}
-define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 {
-; VBITS_GE_256-LABEL: insertelement_v4f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w9, #3
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: fmov d2, #5.00000000
-; VBITS_GE_256-NEXT: index z3.d, #0, #1
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: mov z1.d, x9
-; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_256-NEXT: mov z0.d, p1/m, d2
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+define <4 x double> @insertelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #3
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fmov d2, #5.00000000
+; CHECK-NEXT: index z3.d, #0, #1
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/m, d2
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%r = insertelement <4 x double> %op1, double 5.0, i64 3
ret <4 x double> %r
}
define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
+; VBITS_GE_256-LABEL: insertelement_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: mov w10, #3
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: fmov d3, #5.00000000
+; VBITS_GE_256-NEXT: index z4.d, #0, #1
+; VBITS_GE_256-NEXT: ptrue p1.d
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: mov z2.d, x10
+; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z4.d, z2.d
+; VBITS_GE_256-NEXT: mov z0.d, p1/m, d3
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: insertelement_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #7
@@ -283,39 +320,39 @@ define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
ret <8 x double> %r
}
-define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 {
-; VBITS_GE_1024-LABEL: insertelement_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: mov w9, #15
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: fmov d2, #5.00000000
-; VBITS_GE_1024-NEXT: index z3.d, #0, #1
-; VBITS_GE_1024-NEXT: ptrue p1.d
-; VBITS_GE_1024-NEXT: mov z1.d, x9
-; VBITS_GE_1024-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_1024-NEXT: mov z0.d, p1/m, d2
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <16 x double> @insertelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: insertelement_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #15
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fmov d2, #5.00000000
+; CHECK-NEXT: index z3.d, #0, #1
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/m, d2
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%r = insertelement <16 x double> %op1, double 5.0, i64 15
ret <16 x double> %r
}
-define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 {
-; VBITS_GE_2048-LABEL: insertelement_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: mov w9, #31
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: fmov d2, #5.00000000
-; VBITS_GE_2048-NEXT: index z3.d, #0, #1
-; VBITS_GE_2048-NEXT: ptrue p1.d
-; VBITS_GE_2048-NEXT: mov z1.d, x9
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_2048-NEXT: mov z0.d, p1/m, d2
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x double> @insertelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: insertelement_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #31
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: fmov d2, #5.00000000
+; CHECK-NEXT: index z3.d, #0, #1
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/m, d2
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%r = insertelement <32 x double> %op1, double 5.0, i64 31
ret <32 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index 1d94566f1a8a..09d7595b205b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -1,24 +1,7 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -27,31 +10,34 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i8:
-; CHECK: add v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = add <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i8:
-; CHECK: add v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = add <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = add <32 x i8> %op1, %op2
@@ -60,18 +46,28 @@ define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: add_v64i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b
-; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: add z0.b, z0.b, z2.b
+; VBITS_GE_256-NEXT: add z1.b, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: add_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: add z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = add <64 x i8> %op1, %op2
@@ -79,29 +75,15 @@ define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v128i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b
-; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[OP1_2]].b, [[OP2_2]].b
-; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[OP1_3]].b, [[OP2_3]].b
-; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = add <128 x i8> %op1, %op2
@@ -109,49 +91,15 @@ define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v256i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b
-; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[OP1_2]].b, [[OP2_2]].b
-; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[OP1_3]].b, [[OP2_3]].b
-; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
-; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[OP1_4]].b, [[OP2_4]].b
-; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
-; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[OP1_5]].b, [[OP2_5]].b
-; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
-; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[OP1_6]].b, [[OP2_6]].b
-; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
-; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[OP1_7]].b, [[OP2_7]].b
-; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = add <256 x i8> %op1, %op2
@@ -160,31 +108,34 @@ define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i16:
-; CHECK: add v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = add <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i16:
-; CHECK: add v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = add <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = add <16 x i16> %op1, %op2
@@ -192,16 +143,29 @@ define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: add_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: add z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT: add z1.h, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: add_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: add z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = add <32 x i16> %op1, %op2
@@ -209,16 +173,15 @@ define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = add <64 x i16> %op1, %op2
@@ -226,16 +189,15 @@ define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = add <128 x i16> %op1, %op2
@@ -244,31 +206,34 @@ define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i32:
-; CHECK: add v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = add <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i32:
-; CHECK: add v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = add <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = add <8 x i32> %op1, %op2
@@ -276,16 +241,29 @@ define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: add_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: add z0.s, z0.s, z2.s
+; VBITS_GE_256-NEXT: add z1.s, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: add_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: add z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = add <16 x i32> %op1, %op2
@@ -293,16 +271,15 @@ define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = add <32 x i32> %op1, %op2
@@ -310,16 +287,15 @@ define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = add <64 x i32> %op1, %op2
@@ -328,31 +304,34 @@ define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v1i64:
-; CHECK: add d0, d0, d1
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: ret
%res = add <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i64:
-; CHECK: add v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = add <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = add <4 x i64> %op1, %op2
@@ -360,16 +339,29 @@ define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: add_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: add z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: add z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: add_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: add z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = add <8 x i64> %op1, %op2
@@ -377,16 +369,15 @@ define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = add <16 x i64> %op1, %op2
@@ -394,16 +385,20 @@ define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #16
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1]
+; CHECK-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = add <32 x i64> %op1, %op2
@@ -411,41 +406,39 @@ define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
ret void
}
-;
-; NOTE: Tests beyond this point only have CHECK lines to validate the first
-; VBYTES because the add tests already validate the legalisation code paths.
-;
-
;
; MUL
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i8:
-; CHECK: mul v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = mul <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i8:
-; CHECK: mul v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = mul <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = mul <32 x i8> %op1, %op2
@@ -454,13 +447,28 @@ define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: mul_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: mul z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: mul_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: mul z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = mul <64 x i8> %op1, %op2
@@ -468,14 +476,15 @@ define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = mul <128 x i8> %op1, %op2
@@ -483,14 +492,15 @@ define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = mul <256 x i8> %op1, %op2
@@ -499,31 +509,34 @@ define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i16:
-; CHECK: mul v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = mul <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i16:
-; CHECK: mul v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = mul <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = mul <16 x i16> %op1, %op2
@@ -532,13 +545,28 @@ define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: mul_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: mul_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: mul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = mul <32 x i16> %op1, %op2
@@ -546,14 +574,15 @@ define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = mul <64 x i16> %op1, %op2
@@ -561,14 +590,15 @@ define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = mul <128 x i16> %op1, %op2
@@ -577,31 +607,34 @@ define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v2i32:
-; CHECK: mul v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = mul <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i32:
-; CHECK: mul v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = mul <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = mul <8 x i32> %op1, %op2
@@ -610,13 +643,28 @@ define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: mul_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: mul_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: mul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = mul <16 x i32> %op1, %op2
@@ -624,14 +672,15 @@ define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = mul <32 x i32> %op1, %op2
@@ -639,14 +688,15 @@ define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = mul <64 x i32> %op1, %op2
@@ -656,42 +706,39 @@ define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: mul_v1i64:
-; VBITS_EQ_128: ptrue p0.d, vl1
-; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = mul <1 x i64> %op1, %op2
ret <1 x i64> %res
}
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: mul_v2i64:
-; VBITS_EQ_128: ptrue p0.d, vl2
-; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = mul <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = mul <4 x i64> %op1, %op2
@@ -700,13 +747,28 @@ define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: mul_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: mul_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = mul <8 x i64> %op1, %op2
@@ -714,14 +776,15 @@ define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = mul <16 x i64> %op1, %op2
@@ -729,14 +792,15 @@ define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = mul <32 x i64> %op1, %op2
@@ -749,31 +813,34 @@ define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i8:
-; CHECK: sub v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = sub <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i8:
-; CHECK: sub v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = sub <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sub z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = sub <32 x i8> %op1, %op2
@@ -782,13 +849,28 @@ define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: sub_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sub z0.b, z0.b, z2.b
+; VBITS_GE_256-NEXT: sub z1.b, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sub_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sub z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = sub <64 x i8> %op1, %op2
@@ -796,14 +878,15 @@ define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sub z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = sub <128 x i8> %op1, %op2
@@ -811,14 +894,15 @@ define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sub z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = sub <256 x i8> %op1, %op2
@@ -827,31 +911,34 @@ define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i16:
-; CHECK: sub v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = sub <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i16:
-; CHECK: sub v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = sub <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sub z0.h, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = sub <16 x i16> %op1, %op2
@@ -860,13 +947,28 @@ define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: sub_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sub z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT: sub z1.h, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sub_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sub z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = sub <32 x i16> %op1, %op2
@@ -874,14 +976,15 @@ define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sub z0.h, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = sub <64 x i16> %op1, %op2
@@ -889,14 +992,15 @@ define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sub z0.h, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = sub <128 x i16> %op1, %op2
@@ -905,31 +1009,34 @@ define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i32:
-; CHECK: sub v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = sub <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i32:
-; CHECK: sub v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = sub <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = sub <8 x i32> %op1, %op2
@@ -938,13 +1045,28 @@ define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: sub_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sub z0.s, z0.s, z2.s
+; VBITS_GE_256-NEXT: sub z1.s, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sub_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = sub <16 x i32> %op1, %op2
@@ -952,14 +1074,15 @@ define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = sub <32 x i32> %op1, %op2
@@ -967,14 +1090,15 @@ define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = sub <64 x i32> %op1, %op2
@@ -983,31 +1107,34 @@ define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v1i64:
-; CHECK: sub d0, d0, d1
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: ret
%res = sub <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i64:
-; CHECK: sub v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = sub <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: sub z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = sub <4 x i64> %op1, %op2
@@ -1016,13 +1143,28 @@ define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: sub_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sub z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: sub z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sub_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = sub <8 x i64> %op1, %op2
@@ -1030,14 +1172,15 @@ define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: sub z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = sub <16 x i64> %op1, %op2
@@ -1045,14 +1188,15 @@ define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: sub z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = sub <32 x i64> %op1, %op2
@@ -1066,30 +1210,33 @@ define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @abs_v8i8(<8 x i8> %op1) #0 {
+define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i8:
-; CHECK: abs v0.8b, v0.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @abs_v16i8(<16 x i8> %op1) #0 {
+define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i8:
-; CHECK: abs v0.16b, v0.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
ret <16 x i8> %res
}
-define void @abs_v32i8(<32 x i8>* %a) #0 {
+define void @abs_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: abs z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
store <32 x i8> %res, <32 x i8>* %a
@@ -1097,38 +1244,53 @@ define void @abs_v32i8(<32 x i8>* %a) #0 {
}
define void @abs_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: abs_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: abs_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: abs z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT: abs z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: abs_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: abs z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false)
store <64 x i8> %res, <64 x i8>* %a
ret void
}
-define void @abs_v128i8(<128 x i8>* %a) #0 {
+define void @abs_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: abs z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%res = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %op1, i1 false)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
-define void @abs_v256i8(<256 x i8>* %a) #0 {
+define void @abs_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: abs z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%res = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %op1, i1 false)
store <256 x i8> %res, <256 x i8>* %a
@@ -1136,69 +1298,119 @@ define void @abs_v256i8(<256 x i8>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @abs_v4i16(<4 x i16> %op1) #0 {
+define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i16:
-; CHECK: abs v0.4h, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.4h, v0.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @abs_v8i16(<8 x i16> %op1) #0 {
+define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i16:
-; CHECK: abs v0.8h, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.8h, v0.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
ret <8 x i16> %res
}
-define void @abs_v16i16(<16 x i16>* %a) #0 {
+define void @abs_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
store <16 x i16> %res, <16 x i16>* %a
ret void
}
-define void @abs_v32i16(<32 x i16>* %a) #0 {
+define void @abs_v32i16(<32 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #16
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
+; CHECK-NEXT: abs z1.h, p0/m, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT: st1h { z1.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
-define void @abs_v64i16(<64 x i16>* %a) #0 {
+define void @abs_v64i16(<64 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #48
+; CHECK-NEXT: mov x9, #16
+; CHECK-NEXT: mov x10, #32
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0]
+; CHECK-NEXT: abs z1.h, p0/m, z1.h
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
+; CHECK-NEXT: abs z2.h, p0/m, z2.h
+; CHECK-NEXT: abs z3.h, p0/m, z3.h
+; CHECK-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
+; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
+; CHECK-NEXT: st1h { z3.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %op1, i1 false)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @abs_v128i16(<128 x i16>* %a) #0 {
+define void @abs_v128i16(<128 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #96
+; CHECK-NEXT: mov x9, #48
+; CHECK-NEXT: mov x10, #16
+; CHECK-NEXT: mov x11, #80
+; CHECK-NEXT: mov x12, #32
+; CHECK-NEXT: mov x13, #112
+; CHECK-NEXT: mov x14, #64
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
+; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
+; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
+; CHECK-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1h { z7.h }, p0/z, [x0]
+; CHECK-NEXT: abs z1.h, p0/m, z1.h
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
+; CHECK-NEXT: abs z3.h, p0/m, z3.h
+; CHECK-NEXT: abs z2.h, p0/m, z2.h
+; CHECK-NEXT: abs z5.h, p0/m, z5.h
+; CHECK-NEXT: abs z4.h, p0/m, z4.h
+; CHECK-NEXT: abs z6.h, p0/m, z6.h
+; CHECK-NEXT: abs z7.h, p0/m, z7.h
+; CHECK-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1]
+; CHECK-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1]
+; CHECK-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1]
+; CHECK-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1]
+; CHECK-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
+; CHECK-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1]
+; CHECK-NEXT: st1h { z7.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
store <128 x i16> %res, <128 x i16>* %a
@@ -1206,30 +1418,33 @@ define void @abs_v128i16(<128 x i16>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @abs_v2i32(<2 x i32> %op1) #0 {
+define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i32:
-; CHECK: abs v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.2s, v0.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @abs_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i32:
-; CHECK: abs v0.4s, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.4s, v0.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
ret <4 x i32> %res
}
-define void @abs_v8i32(<8 x i32>* %a) #0 {
+define void @abs_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
store <8 x i32> %res, <8 x i32>* %a
@@ -1237,38 +1452,53 @@ define void @abs_v8i32(<8 x i32>* %a) #0 {
}
define void @abs_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: abs_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: abs_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: abs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: abs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: abs_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: abs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
-define void @abs_v32i32(<32 x i32>* %a) #0 {
+define void @abs_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %op1, i1 false)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @abs_v64i32(<64 x i32>* %a) #0 {
+define void @abs_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %op1, i1 false)
store <64 x i32> %res, <64 x i32>* %a
@@ -1276,30 +1506,33 @@ define void @abs_v64i32(<64 x i32>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @abs_v1i64(<1 x i64> %op1) #0 {
+define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v1i64:
-; CHECK: abs d0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs d0, d0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @abs_v2i64(<2 x i64> %op1) #0 {
+define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i64:
-; CHECK: abs v0.2d, v0.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: abs v0.2d, v0.2d
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
ret <2 x i64> %res
}
-define void @abs_v4i64(<4 x i64>* %a) #0 {
+define void @abs_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: abs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
store <4 x i64> %res, <4 x i64>* %a
@@ -1307,38 +1540,53 @@ define void @abs_v4i64(<4 x i64>* %a) #0 {
}
define void @abs_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: abs_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: abs_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: abs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: abs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: abs_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: abs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
-define void @abs_v16i64(<16 x i64>* %a) #0 {
+define void @abs_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: abs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %op1, i1 false)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @abs_v32i64(<32 x i64>* %a) #0 {
+define void @abs_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: abs z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %op1, i1 false)
store <32 x i64> %res, <32 x i64>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
index 9cecfbe40b74..29b9392c77d0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
@@ -1,58 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
;
; ICMP EQ
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i8:
-; CHECK: cmeq v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%cmp = icmp eq <8 x i8> %op1, %op2
%sext = sext <8 x i1> %cmp to <8 x i8>
ret <8 x i8> %sext
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v16i8:
-; CHECK: cmeq v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%cmp = icmp eq <16 x i8> %op1, %op2
%sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %sext
}
-define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%cmp = icmp eq <32 x i8> %op1, %op2
@@ -62,29 +50,31 @@ define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].b, [[PG]]/z, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].b, [[PG]]/z, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].b, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].b, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1b { [[SEXT_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[SEXT_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
+; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
+; VBITS_GE_256-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.b, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; VBITS_GE_512-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%cmp = icmp eq <64 x i8> %op1, %op2
@@ -93,15 +83,16 @@ define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%cmp = icmp eq <128 x i8> %op1, %op2
@@ -110,15 +101,16 @@ define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%cmp = icmp eq <256 x i8> %op1, %op2
@@ -128,34 +120,37 @@ define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i16:
-; CHECK: cmeq v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%cmp = icmp eq <4 x i16> %op1, %op2
%sext = sext <4 x i1> %cmp to <4 x i16>
ret <4 x i16> %sext
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i16:
-; CHECK: cmeq v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%cmp = icmp eq <8 x i16> %op1, %op2
%sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %sext
}
-define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%cmp = icmp eq <16 x i16> %op1, %op2
@@ -165,29 +160,31 @@ define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].h, [[PG]]/z, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].h, [[PG]]/z, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].h, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].h, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1h { [[SEXT_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[SEXT_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; VBITS_GE_512-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%cmp = icmp eq <32 x i16> %op1, %op2
@@ -196,15 +193,16 @@ define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%cmp = icmp eq <64 x i16> %op1, %op2
@@ -213,15 +211,16 @@ define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%cmp = icmp eq <128 x i16> %op1, %op2
@@ -231,34 +230,37 @@ define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v2i32:
-; CHECK: cmeq v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%cmp = icmp eq <2 x i32> %op1, %op2
%sext = sext <2 x i1> %cmp to <2 x i32>
ret <2 x i32> %sext
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i32:
-; CHECK: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%cmp = icmp eq <4 x i32> %op1, %op2
%sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %sext
}
-define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%cmp = icmp eq <8 x i32> %op1, %op2
@@ -268,29 +270,31 @@ define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].s, [[PG]]/z, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].s, [[PG]]/z, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].s, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].s, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1w { [[SEXT_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[SEXT_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; VBITS_GE_512-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%cmp = icmp eq <16 x i32> %op1, %op2
@@ -299,15 +303,16 @@ define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%cmp = icmp eq <32 x i32> %op1, %op2
@@ -316,15 +321,16 @@ define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%cmp = icmp eq <64 x i32> %op1, %op2
@@ -334,34 +340,37 @@ define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v1i64:
-; CHECK: cmeq d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq d0, d0, d1
+; CHECK-NEXT: ret
%cmp = icmp eq <1 x i64> %op1, %op2
%sext = sext <1 x i1> %cmp to <1 x i64>
ret <1 x i64> %sext
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v2i64:
-; CHECK: cmeq v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%cmp = icmp eq <2 x i64> %op1, %op2
%sext = sext <2 x i1> %cmp to <2 x i64>
ret <2 x i64> %sext
}
-define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%cmp = icmp eq <4 x i64> %op1, %op2
@@ -371,29 +380,31 @@ define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].d, [[PG]]/z, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].d, [[PG]]/z, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].d, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].d, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1d { [[SEXT_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[SEXT_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; VBITS_GE_512-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%cmp = icmp eq <8 x i64> %op1, %op2
@@ -402,15 +413,16 @@ define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%cmp = icmp eq <16 x i64> %op1, %op2
@@ -419,15 +431,16 @@ define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%cmp = icmp eq <32 x i64> %op1, %op2
@@ -440,15 +453,16 @@ define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; ICMP NE
;
-define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_ne_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpne [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%cmp = icmp ne <32 x i8> %op1, %op2
@@ -461,15 +475,16 @@ define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; ICMP SGE
;
-define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_sge_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%cmp = icmp sge <32 x i16> %op1, %op2
@@ -482,15 +497,16 @@ define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; ICMP SGT
;
-define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_sgt_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%cmp = icmp sgt <16 x i16> %op1, %op2
@@ -503,15 +519,16 @@ define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; ICMP SLE
;
-define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_sle_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpge p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%cmp = icmp sle <16 x i32> %op1, %op2
@@ -524,15 +541,16 @@ define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; ICMP SLT
;
-define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_slt_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpgt p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%cmp = icmp slt <8 x i32> %op1, %op2
@@ -545,15 +563,16 @@ define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; ICMP UGE
;
-define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_uge_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmphs p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%cmp = icmp uge <8 x i64> %op1, %op2
@@ -566,15 +585,16 @@ define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; ICMP UGT
;
-define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_ugt_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmphi p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%cmp = icmp ugt <4 x i64> %op1, %op2
@@ -587,15 +607,16 @@ define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; ICMP ULE
;
-define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_ule_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmphs p1.d, p0/z, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%cmp = icmp ule <16 x i64> %op1, %op2
@@ -608,15 +629,16 @@ define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; ICMP ULT
;
-define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_ult_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmphi p1.d, p0/z, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%cmp = icmp ult <32 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index 8b1bae5009a1..9c1e9577df16 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -1,19 +1,8 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048,VBITS_EQ_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,164 +13,164 @@ target triple = "aarch64-unknown-linux-gnu"
; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: sdiv_v8i8:
-; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG0]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s0, [[SCALAR0]]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v8i8:
-; VBITS_EQ_128: sshll v1.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sshll v0.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT: xtn v0.8b, v0.8h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: sdiv_v8i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
+; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: umov w8, v1.h[0]
+; VBITS_GE_256-NEXT: umov w9, v1.h[1]
+; VBITS_GE_256-NEXT: fmov s0, w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[2]
+; VBITS_GE_256-NEXT: mov v0.b[1], w9
+; VBITS_GE_256-NEXT: mov v0.b[2], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[3]
+; VBITS_GE_256-NEXT: mov v0.b[3], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[4]
+; VBITS_GE_256-NEXT: mov v0.b[4], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[5]
+; VBITS_GE_256-NEXT: mov v0.b[5], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[6]
+; VBITS_GE_256-NEXT: mov v0.b[6], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[7]
+; VBITS_GE_256-NEXT: mov v0.b[7], w8
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: umov w8, v1.h[0]
+; VBITS_GE_512-NEXT: umov w9, v1.h[1]
+; VBITS_GE_512-NEXT: fmov s0, w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[2]
+; VBITS_GE_512-NEXT: mov v0.b[1], w9
+; VBITS_GE_512-NEXT: mov v0.b[2], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[3]
+; VBITS_GE_512-NEXT: mov v0.b[3], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[4]
+; VBITS_GE_512-NEXT: mov v0.b[4], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[5]
+; VBITS_GE_512-NEXT: mov v0.b[5], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[6]
+; VBITS_GE_512-NEXT: mov v0.b[6], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[7]
+; VBITS_GE_512-NEXT: mov v0.b[7], w8
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT: ret
%res = sdiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: sdiv_v16i8:
-
-; HALF VECTOR:
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v16i8:
-; VBITS_EQ_128: sunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT: sunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sunpklo z1.h, z1.b
-; VBITS_EQ_128-NEXT: sunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: sunpklo z0.h, z0.b
-; VBITS_EQ_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: sunpkhi z3.s, z1.h
-; VBITS_EQ_128-NEXT: sunpkhi z5.s, z0.h
-; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
-; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: uzp1 z1.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z3.h
-; VBITS_EQ_128-NEXT: uzp1 z0.b, z0.b, z1.b
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: sdiv_v16i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: sunpkhi z3.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h
+; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = sdiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z1.h, z1.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = sdiv <32 x i8> %op1, %op2
@@ -189,66 +178,22 @@ define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES2]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z1.h, z1.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = sdiv <64 x i8> %op1, %op2
@@ -256,52 +201,25 @@ define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z1.h, z1.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = sdiv <128 x i8> %op1, %op2
@@ -309,35 +227,34 @@ define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v256i8:
-
-; FULL VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpkhi z2.h, z1.b
+; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: sunpklo z1.h, z1.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpkhi z4.s, z2.h
+; CHECK-NEXT: sunpkhi z5.s, z3.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT: sunpkhi z5.s, z1.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = sdiv <256 x i8> %op1, %op2
@@ -348,84 +265,144 @@ define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: sdiv_v4i16:
-; CHECK: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v4i16:
-; VBITS_EQ_128: sshll v1.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sshll v0.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: sdiv_v4i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v4i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: sshll v0.4s, v0.4h, #0
+; VBITS_GE_256-NEXT: sdivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_256-NEXT: mov w8, v1.s[1]
+; VBITS_GE_256-NEXT: mov w9, v1.s[2]
+; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
+; VBITS_GE_256-NEXT: mov v0.h[1], w8
+; VBITS_GE_256-NEXT: mov w8, v1.s[3]
+; VBITS_GE_256-NEXT: mov v0.h[2], w9
+; VBITS_GE_256-NEXT: mov v0.h[3], w8
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v4i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl4
+; VBITS_GE_512-NEXT: sshll v0.4s, v0.4h, #0
+; VBITS_GE_512-NEXT: sdivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_512-NEXT: mov w8, v1.s[1]
+; VBITS_GE_512-NEXT: mov w9, v1.s[2]
+; VBITS_GE_512-NEXT: mov v0.16b, v1.16b
+; VBITS_GE_512-NEXT: mov v0.h[1], w8
+; VBITS_GE_512-NEXT: mov w8, v1.s[3]
+; VBITS_GE_512-NEXT: mov v0.h[2], w9
+; VBITS_GE_512-NEXT: mov v0.h[3], w8
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT: ret
%res = sdiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: sdiv_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v8i16:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: sdiv_v8i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = sdiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: sdiv_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; VBITS_GE_128-LABEL: sdiv_v16i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpkhi z6.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: ldp q3, q2, [x0]
+; VBITS_GE_128-NEXT: sunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z3.s
+; VBITS_GE_128-NEXT: sdivr z1.s, p0/m, z1.s, z2.s
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h
+; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z4.h
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ptrue p1.s, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = sdiv <16 x i16> %op1, %op2
@@ -433,34 +410,19 @@ define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = sdiv <32 x i16> %op1, %op2
@@ -468,34 +430,19 @@ define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i16:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = sdiv <64 x i16> %op1, %op2
@@ -503,21 +450,22 @@ define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i16:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = sdiv <128 x i16> %op1, %op2
@@ -526,45 +474,42 @@ define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
-define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v2i32:
-; VBITS_EQ_128: ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = sdiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
-define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v4i32:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = sdiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = sdiv <8 x i32> %op1, %op2
@@ -573,13 +518,45 @@ define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: sdiv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: sdiv_v16i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z4.s
+; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z5.s
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z2
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z6.s
+; VBITS_GE_128-NEXT: movprfx z1, z3
+; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z4.s
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = sdiv <16 x i32> %op1, %op2
@@ -587,14 +564,15 @@ define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = sdiv <32 x i32> %op1, %op2
@@ -602,14 +580,15 @@ define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = sdiv <64 x i32> %op1, %op2
@@ -618,45 +597,42 @@ define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 sdiv are not legal for NEON so use SVE when available.
-define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v1i64:
-; VBITS_EQ_128: ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = sdiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 sdiv are not legal for NEON so use SVE when available.
-define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v2i64:
-; VBITS_EQ_128: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = sdiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = sdiv <4 x i64> %op1, %op2
@@ -665,13 +641,45 @@ define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: sdiv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: sdiv_v8i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z4.d
+; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z5.d
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z2
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z6.d
+; VBITS_GE_128-NEXT: movprfx z1, z3
+; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z4.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: sdiv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sdiv z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = sdiv <8 x i64> %op1, %op2
@@ -679,14 +687,15 @@ define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = sdiv <16 x i64> %op1, %op2
@@ -694,14 +703,15 @@ define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = sdiv <32 x i64> %op1, %op2
@@ -716,164 +726,164 @@ define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; Vector vXi8 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: udiv_v8i8:
-; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG0]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s0, [[SCALAR0]]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v8i8:
-; VBITS_EQ_128: ushll v1.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: ushll v0.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT: xtn v0.8b, v0.8h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: udiv_v8i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
+; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: umov w8, v1.h[0]
+; VBITS_GE_256-NEXT: umov w9, v1.h[1]
+; VBITS_GE_256-NEXT: fmov s0, w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[2]
+; VBITS_GE_256-NEXT: mov v0.b[1], w9
+; VBITS_GE_256-NEXT: mov v0.b[2], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[3]
+; VBITS_GE_256-NEXT: mov v0.b[3], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[4]
+; VBITS_GE_256-NEXT: mov v0.b[4], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[5]
+; VBITS_GE_256-NEXT: mov v0.b[5], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[6]
+; VBITS_GE_256-NEXT: mov v0.b[6], w8
+; VBITS_GE_256-NEXT: umov w8, v1.h[7]
+; VBITS_GE_256-NEXT: mov v0.b[7], w8
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v8i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: umov w8, v1.h[0]
+; VBITS_GE_512-NEXT: umov w9, v1.h[1]
+; VBITS_GE_512-NEXT: fmov s0, w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[2]
+; VBITS_GE_512-NEXT: mov v0.b[1], w9
+; VBITS_GE_512-NEXT: mov v0.b[2], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[3]
+; VBITS_GE_512-NEXT: mov v0.b[3], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[4]
+; VBITS_GE_512-NEXT: mov v0.b[4], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[5]
+; VBITS_GE_512-NEXT: mov v0.b[5], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[6]
+; VBITS_GE_512-NEXT: mov v0.b[6], w8
+; VBITS_GE_512-NEXT: umov w8, v1.h[7]
+; VBITS_GE_512-NEXT: mov v0.b[7], w8
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT: ret
%res = udiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: udiv_v16i8:
-
-; HALF VECTOR:
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v16i8:
-; VBITS_EQ_128: uunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT: uunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: uunpklo z1.h, z1.b
-; VBITS_EQ_128-NEXT: uunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: uunpklo z0.h, z0.b
-; VBITS_EQ_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: uunpkhi z3.s, z1.h
-; VBITS_EQ_128-NEXT: uunpkhi z5.s, z0.h
-; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
-; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: uzp1 z1.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z3.h
-; VBITS_EQ_128-NEXT: uzp1 z0.b, z0.b, z1.b
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: udiv_v16i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: uunpkhi z3.s, z1.h
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h
+; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = udiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = udiv <32 x i8> %op1, %op2
@@ -881,66 +891,22 @@ define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES2]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = udiv <64 x i8> %op1, %op2
@@ -948,52 +914,25 @@ define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = udiv <128 x i8> %op1, %op2
@@ -1001,33 +940,34 @@ define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v256i8:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpkhi z2.h, z1.b
+; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z4.s, z2.h
+; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT: uunpkhi z5.s, z1.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = udiv <256 x i8> %op1, %op2
@@ -1038,84 +978,144 @@ define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; Vector vXi16 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: udiv_v4i16:
-; CHECK: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v4i16:
-; VBITS_EQ_128: ushll v1.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: ushll v0.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: udiv_v4i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v4i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: ushll v0.4s, v0.4h, #0
+; VBITS_GE_256-NEXT: udivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_256-NEXT: mov w8, v1.s[1]
+; VBITS_GE_256-NEXT: mov w9, v1.s[2]
+; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
+; VBITS_GE_256-NEXT: mov v0.h[1], w8
+; VBITS_GE_256-NEXT: mov w8, v1.s[3]
+; VBITS_GE_256-NEXT: mov v0.h[2], w9
+; VBITS_GE_256-NEXT: mov v0.h[3], w8
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v4i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl4
+; VBITS_GE_512-NEXT: ushll v0.4s, v0.4h, #0
+; VBITS_GE_512-NEXT: udivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_512-NEXT: mov w8, v1.s[1]
+; VBITS_GE_512-NEXT: mov w9, v1.s[2]
+; VBITS_GE_512-NEXT: mov v0.16b, v1.16b
+; VBITS_GE_512-NEXT: mov v0.h[1], w8
+; VBITS_GE_512-NEXT: mov w8, v1.s[3]
+; VBITS_GE_512-NEXT: mov v0.h[2], w9
+; VBITS_GE_512-NEXT: mov v0.h[3], w8
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT: ret
%res = udiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: udiv_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v8i16:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: udiv_v8i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = udiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: udiv_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; VBITS_GE_128-LABEL: udiv_v16i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: uunpkhi z6.s, z0.h
+; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: ldp q3, q2, [x0]
+; VBITS_GE_128-NEXT: uunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z3.s
+; VBITS_GE_128-NEXT: udivr z1.s, p0/m, z1.s, z2.s
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h
+; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z4.h
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ptrue p1.s, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = udiv <16 x i16> %op1, %op2
@@ -1123,34 +1123,19 @@ define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = udiv <32 x i16> %op1, %op2
@@ -1158,34 +1143,19 @@ define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i16:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = udiv <64 x i16> %op1, %op2
@@ -1193,21 +1163,22 @@ define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i16:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = udiv <128 x i16> %op1, %op2
@@ -1216,46 +1187,42 @@ define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Vector v2i32 udiv are not legal for NEON so use SVE when available.
-define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v2i32:
-; VBITS_EQ_128: ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = udiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 udiv are not legal for NEON so use SVE when available.
-define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v4i32:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = udiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = udiv <8 x i32> %op1, %op2
@@ -1264,13 +1231,45 @@ define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: udiv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: udiv_v16i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z4.s
+; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z5.s
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z2
+; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z6.s
+; VBITS_GE_128-NEXT: movprfx z1, z3
+; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z4.s
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: udiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = udiv <16 x i32> %op1, %op2
@@ -1278,14 +1277,15 @@ define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = udiv <32 x i32> %op1, %op2
@@ -1293,14 +1293,15 @@ define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = udiv <64 x i32> %op1, %op2
@@ -1309,45 +1310,42 @@ define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 udiv are not legal for NEON so use SVE when available.
-define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v1i64:
-; VBITS_EQ_128: ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT: udiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = udiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 udiv are not legal for NEON so use SVE when available.
-define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v2i64:
-; VBITS_EQ_128: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: udiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = udiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = udiv <4 x i64> %op1, %op2
@@ -1356,13 +1354,45 @@ define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: udiv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: udiv_v8i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z4.d
+; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z5.d
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z2
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z6.d
+; VBITS_GE_128-NEXT: movprfx z1, z3
+; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z4.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: udiv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: udiv z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: udiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: udiv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = udiv <8 x i64> %op1, %op2
@@ -1370,14 +1400,15 @@ define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = udiv <16 x i64> %op1, %op2
@@ -1385,14 +1416,15 @@ define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = udiv <32 x i64> %op1, %op2
@@ -1402,14 +1434,15 @@ define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; This used to crash because isUnaryPredicate and BuildUDIV don't know how
; a SPLAT_VECTOR of fixed vector type should be handled.
-define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #1 {
+define void @udiv_constantsplat_v8i32(<8 x i32>* %a) vscale_range(2,0) #1 {
; CHECK-LABEL: udiv_constantsplat_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: mov [[OP2:z[0-9]+]].s, #95
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: mov z1.s, #95 // =0x5f
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
store <8 x i32> %res, <8 x i32>* %a
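
The regenerated CHECK bodies in these tests are produced mechanically rather than written by hand (see the autogeneration NOTE in the updated RUN headers). As a minimal sketch, assuming llc has already been built into build/bin (the path is illustrative), refreshing a single test looks like:

  python llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll

On a rerun the script replaces the previously generated check block for each function, so hand edits to those CHECK lines do not survive regeneration.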
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
index bfe4b47242c3..b9f5c12c331a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
@@ -1,25 +1,10 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
;
; sext i1 -> i32
;
@@ -27,15 +12,17 @@ target triple = "aarch64-unknown-linux-gnu"
; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not byte based and thus cannot be lowered directly to
; an SVE instruction.
-define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 {
+define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i1_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; CHECK-NEXT: lsl [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
-; CHECK-NEXT: asr [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
-; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #31
+; CHECK-NEXT: asr z0.s, p0/m, z0.s, #31
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <8 x i1> %a to <8 x i32>
store <8 x i32> %b, <8 x i32>* %out
ret void
@@ -48,15 +35,17 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 {
; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not power-of-2 based and thus cannot be lowered
; directly to an SVE instruction.
-define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i3_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: lsl [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
-; CHECK-NEXT: asr [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
-; CHECK-NEXT: st1d { [[A_WORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #61
+; CHECK-NEXT: asr z0.d, p0/m, z0.d, #61
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <4 x i3> %a to <4 x i64>
store <4 x i64> %b, <4 x i64>* %out
ret void
@@ -66,12 +55,14 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 {
; sext i8 -> i16
;
-define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
+define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <16 x i8> %a to <16 x i16>
store <16 x i16>%b, <16 x i16>* %out
ret void
@@ -79,12 +70,29 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
; NOTE: Extra 'add' is to prevent the extend being combined with the load.
define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
-; CHECK-LABEL: sext_v32i8_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v32i8_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sext_v32i8_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %in
%b = add <32 x i8> %a, %a
%c = sext <32 x i8> %b to <32 x i16>
@@ -92,13 +100,16 @@ define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
ret void
}
-define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
+define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i8>, <64 x i8>* %in
%b = add <64 x i8> %a, %a
%c = sext <64 x i8> %b to <64 x i16>
@@ -106,13 +117,16 @@ define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
ret void
}
-define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
+define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v128i8_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <128 x i8>, <128 x i8>* %in
%b = add <128 x i8> %a, %a
%c = sext <128 x i8> %b to <128 x i16>
@@ -124,50 +138,59 @@ define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
; sext i8 -> i32
;
-define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
+define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i8_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; CHECK-NEXT: st1w { [[A_HALFS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <8 x i8> %a to <8 x i32>
store <8 x i32>%b, <8 x i32>* %out
ret void
}
define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
-; CHECK-LABEL: sext_v16i8_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
-; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
-; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v16i8_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sext_v16i8_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%b = sext <16 x i8> %a to <16 x i32>
store <16 x i32> %b, <16 x i32>* %out
ret void
}
-define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
+define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %in
%b = add <32 x i8> %a, %a
%c = sext <32 x i8> %b to <32 x i32>
@@ -175,14 +198,17 @@ define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
ret void
}
-define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
+define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i8>, <64 x i8>* %in
%b = add <64 x i8> %a, %a
%c = sext <64 x i8> %b to <64 x i32>
@@ -197,54 +223,77 @@ define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The sign
; extend is a two-step process where the container is any_extend'd with the
; result feeding an inreg sign extend.
-define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i8_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[ANYEXT_W:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[ANYEXT_D:z[0-9]+]].d, [[ANYEXT_W]].s
-; CHECK-NEXT: sxtb [[A_DWORDS:z[0-9]+]].d, [[PG]]/m, [[ANYEXT_D]].d
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <4 x i8> %a to <4 x i64>
store <4 x i64>%b, <4 x i64>* %out
ret void
}
define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: sext_v8i8_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v8i8_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sext_v8i8_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%b = sext <8 x i8> %a to <8 x i64>
store <8 x i64>%b, <8 x i64>* %out
ret void
}
-define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
+define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <16 x i8> %a to <16 x i64>
store <16 x i64> %b, <16 x i64>* %out
ret void
}
-define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
+define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %in
%b = add <32 x i8> %a, %a
%c = sext <32 x i8> %b to <32 x i64>
@@ -256,24 +305,43 @@ define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
; sext i16 -> i32
;
-define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
+define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i16_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <8 x i16> %a to <8 x i32>
store <8 x i32>%b, <8 x i32>* %out
ret void
}
define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
-; CHECK-LABEL: sext_v16i16_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v16i16_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sext_v16i16_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %in
%b = add <16 x i16> %a, %a
%c = sext <16 x i16> %b to <16 x i32>
@@ -281,13 +349,16 @@ define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
ret void
}
-define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
+define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %in
%b = add <32 x i16> %a, %a
%c = sext <32 x i16> %b to <32 x i32>
@@ -295,13 +366,16 @@ define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
ret void
}
-define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
+define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i16_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
%b = add <64 x i16> %a, %a
%c = sext <64 x i16> %b to <64 x i32>
@@ -313,38 +387,59 @@ define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
; sext i16 -> i64
;
-define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i16_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <4 x i16> %a to <4 x i64>
store <4 x i64>%b, <4 x i64>* %out
ret void
}
define void @sext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: sext_v8i16_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v8i16_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sext_v8i16_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%b = sext <8 x i16> %a to <8 x i64>
store <8 x i64>%b, <8 x i64>* %out
ret void
}
-define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
+define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i16_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %in
%b = add <16 x i16> %a, %a
%c = sext <16 x i16> %b to <16 x i64>
@@ -352,14 +447,17 @@ define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
ret void
}
-define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
+define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %in
%b = add <32 x i16> %a, %a
%c = sext <32 x i16> %b to <32 x i64>
@@ -371,24 +469,43 @@ define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
; sext i32 -> i64
;
-define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i32_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = sext <4 x i32> %a to <4 x i64>
store <4 x i64>%b, <4 x i64>* %out
ret void
}
define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
-; CHECK-LABEL: sext_v8i32_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v8i32_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sext_v8i32_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %in
%b = add <8 x i32> %a, %a
%c = sext <8 x i32> %b to <8 x i64>
@@ -396,13 +513,16 @@ define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
ret void
}
-define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
+define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i32_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %in
%b = add <16 x i32> %a, %a
%c = sext <16 x i32> %b to <16 x i64>
@@ -410,13 +530,16 @@ define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
ret void
}
-define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
+define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i32_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
%b = add <32 x i32> %a, %a
%c = sext <32 x i32> %b to <32 x i64>
@@ -428,12 +551,14 @@ define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
; zext i8 -> i16
;
-define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
+define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <16 x i8> %a to <16 x i16>
store <16 x i16>%b, <16 x i16>* %out
ret void
@@ -441,12 +566,29 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
; NOTE: Extra 'add' is to prevent the extend being combined with the load.
define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
-; CHECK-LABEL: zext_v32i8_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v32i8_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: zext_v32i8_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %in
%b = add <32 x i8> %a, %a
%c = zext <32 x i8> %b to <32 x i16>
@@ -454,13 +596,16 @@ define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
ret void
}
-define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
+define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i8>, <64 x i8>* %in
%b = add <64 x i8> %a, %a
%c = zext <64 x i8> %b to <64 x i16>
@@ -468,13 +613,16 @@ define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
ret void
}
-define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
+define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v128i8_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <128 x i8>, <128 x i8>* %in
%b = add <128 x i8> %a, %a
%c = zext <128 x i8> %b to <128 x i16>
@@ -486,50 +634,59 @@ define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
; zext i8 -> i32
;
-define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
+define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i8_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; CHECK-NEXT: st1w { [[A_HALFS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <8 x i8> %a to <8 x i32>
store <8 x i32>%b, <8 x i32>* %out
ret void
}
define void @zext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
-; CHECK-LABEL: zext_v16i8_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
-; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
-; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[OUT_HI:[0-9]+]], #8
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x0, x[[OUT_HI]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v16i8_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: zext_v16i8_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%b = zext <16 x i8> %a to <16 x i32>
store <16 x i32> %b, <16 x i32>* %out
ret void
}
-define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
+define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %in
%b = add <32 x i8> %a, %a
%c = zext <32 x i8> %b to <32 x i32>
@@ -537,14 +694,17 @@ define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
ret void
}
-define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
+define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i8>, <64 x i8>* %in
%b = add <64 x i8> %a, %a
%c = zext <64 x i8> %b to <64 x i32>
@@ -559,54 +719,77 @@ define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The zero
; extend is a two-step process where the container is zero_extend_inreg'd with
; the result feeding a normal zero extend from halfwords to doublewords.
-define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
+define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i8_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <4 x i8> %a to <4 x i64>
store <4 x i64>%b, <4 x i64>* %out
ret void
}
define void @zext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: zext_v8i8_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v8i8_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: zext_v8i8_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%b = zext <8 x i8> %a to <8 x i64>
store <8 x i64>%b, <8 x i64>* %out
ret void
}
-define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
+define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <16 x i8> %a to <16 x i64>
store <16 x i64> %b, <16 x i64>* %out
ret void
}
-define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
+define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %in
%b = add <32 x i8> %a, %a
%c = zext <32 x i8> %b to <32 x i64>
@@ -618,24 +801,43 @@ define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
; zext i16 -> i32
;
-define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
+define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i16_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <8 x i16> %a to <8 x i32>
store <8 x i32>%b, <8 x i32>* %out
ret void
}
define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
-; CHECK-LABEL: zext_v16i16_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v16i16_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: zext_v16i16_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %in
%b = add <16 x i16> %a, %a
%c = zext <16 x i16> %b to <16 x i32>
@@ -643,13 +845,16 @@ define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
ret void
}
-define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
+define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %in
%b = add <32 x i16> %a, %a
%c = zext <32 x i16> %b to <32 x i32>
@@ -657,13 +862,16 @@ define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
ret void
}
-define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
+define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i16_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
%b = add <64 x i16> %a, %a
%c = zext <64 x i16> %b to <64 x i32>
@@ -675,38 +883,59 @@ define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
; zext i16 -> i64
;
-define void @zext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
+define void @zext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i16_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <4 x i16> %a to <4 x i64>
store <4 x i64>%b, <4 x i64>* %out
ret void
}
define void @zext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: zext_v8i16_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v8i16_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: zext_v8i16_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%b = zext <8 x i16> %a to <8 x i64>
store <8 x i64>%b, <8 x i64>* %out
ret void
}
-define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
+define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i16_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %in
%b = add <16 x i16> %a, %a
%c = zext <16 x i16> %b to <16 x i64>
@@ -714,14 +943,17 @@ define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
ret void
}
-define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
+define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %in
%b = add <32 x i16> %a, %a
%c = zext <32 x i16> %b to <32 x i64>
@@ -733,24 +965,43 @@ define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
; zext i32 -> i64
;
-define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
+define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i32_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%b = zext <4 x i32> %a to <4 x i64>
store <4 x i64>%b, <4 x i64>* %out
ret void
}
define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
-; CHECK-LABEL: zext_v8i32_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v8i32_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: zext_v8i32_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %in
%b = add <8 x i32> %a, %a
%c = zext <8 x i32> %b to <8 x i64>
@@ -758,13 +1009,16 @@ define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
ret void
}
-define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
+define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i32_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %in
%b = add <16 x i32> %a, %a
%c = zext <16 x i32> %b to <16 x i64>
@@ -772,13 +1026,16 @@ define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
ret void
}
-define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
+define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i32_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
%b = add <32 x i32> %a, %a
%c = zext <32 x i32> %b to <32 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
index 0833ed53a932..9e9ce74f7b1b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
@@ -1,60 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; AND
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v8i8:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = and <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v16i8:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = and <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = and <32 x i8> %op1, %op2
@@ -63,18 +46,28 @@ define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @and_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: and_v64i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_256-DAG: and [[RES_1:z[0-9]+]].d, [[OP1_1]].d, [[OP2_1]].d
-; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: and_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = and <64 x i8> %op1, %op2
@@ -82,29 +75,15 @@ define void @and_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: and_v128i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_512-DAG: and [[RES_1:z[0-9]+]].d, [[OP1_1]].d, [[OP2_1]].d
-; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_256-DAG: and [[RES_2:z[0-9]+]].d, [[OP1_2]].d, [[OP2_2]].d
-; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_256-DAG: and [[RES_3:z[0-9]+]].d, [[OP1_3]].d, [[OP2_3]].d
-; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = and <128 x i8> %op1, %op2
@@ -112,49 +91,15 @@ define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: and_v256i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: and [[RES_1:z[0-9]+]].d, [[OP1_1]].d, [[OP2_1]].d
-; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_512-DAG: and [[RES_2:z[0-9]+]].d, [[OP1_2]].d, [[OP2_2]].d
-; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_512-DAG: and [[RES_3:z[0-9]+]].d, [[OP1_3]].d, [[OP2_3]].d
-; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
-; VBITS_LE_256-DAG: and [[RES_4:z[0-9]+]].d, [[OP1_4]].d, [[OP2_4]].d
-; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
-; VBITS_LE_256-DAG: and [[RES_5:z[0-9]+]].d, [[OP1_5]].d, [[OP2_5]].d
-; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
-; VBITS_LE_256-DAG: and [[RES_6:z[0-9]+]].d, [[OP1_6]].d, [[OP2_6]].d
-; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
-; VBITS_LE_256-DAG: and [[RES_7:z[0-9]+]].d, [[OP1_7]].d, [[OP2_7]].d
-; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = and <256 x i8> %op1, %op2
@@ -163,31 +108,34 @@ define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v4i16:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = and <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v8i16:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = and <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = and <16 x i16> %op1, %op2
@@ -195,16 +143,29 @@ define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
define void @and_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: and_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: and_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = and <32 x i16> %op1, %op2
@@ -212,16 +173,15 @@ define void @and_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: and_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = and <64 x i16> %op1, %op2
@@ -229,16 +189,15 @@ define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: and_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = and <128 x i16> %op1, %op2
@@ -247,31 +206,34 @@ define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v2i32:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = and <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v4i32:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = and <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = and <8 x i32> %op1, %op2
@@ -279,16 +241,29 @@ define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
define void @and_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: and_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: and_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = and <16 x i32> %op1, %op2
@@ -296,16 +271,15 @@ define void @and_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: and_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = and <32 x i32> %op1, %op2
@@ -313,16 +287,15 @@ define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: and_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = and <64 x i32> %op1, %op2
@@ -331,31 +304,34 @@ define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v1i64:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = and <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v2i64:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = and <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: and_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = and <4 x i64> %op1, %op2
@@ -363,16 +339,29 @@ define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
define void @and_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: and_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: and_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = and <8 x i64> %op1, %op2
@@ -380,16 +369,15 @@ define void @and_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: and_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = and <16 x i64> %op1, %op2
@@ -397,16 +385,15 @@ define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: and_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = and <32 x i64> %op1, %op2
@@ -414,41 +401,39 @@ define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
ret void
}
-;
-; NOTE: Tests beyond this point only have CHECK lines to validate the first
-; VBYTES because the and tests already validate the legalisation code paths.
-;
-
;
; OR
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v8i8:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = or <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v16i8:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = or <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = or <32 x i8> %op1, %op2
@@ -457,13 +442,28 @@ define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @or_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: or_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: or_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = or <64 x i8> %op1, %op2
@@ -471,14 +471,15 @@ define void @or_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: or_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = or <128 x i8> %op1, %op2
@@ -486,14 +487,15 @@ define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: or_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = or <256 x i8> %op1, %op2
@@ -502,31 +504,34 @@ define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v4i16:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = or <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v8i16:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = or <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = or <16 x i16> %op1, %op2
@@ -535,13 +540,28 @@ define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @or_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: or_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: or_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = or <32 x i16> %op1, %op2
@@ -549,14 +569,15 @@ define void @or_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: or_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = or <64 x i16> %op1, %op2
@@ -564,14 +585,15 @@ define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: or_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = or <128 x i16> %op1, %op2
@@ -580,31 +602,34 @@ define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v2i32:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = or <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v4i32:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = or <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = or <8 x i32> %op1, %op2
@@ -613,13 +638,28 @@ define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @or_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: or_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: or_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = or <16 x i32> %op1, %op2
@@ -627,14 +667,15 @@ define void @or_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: or_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = or <32 x i32> %op1, %op2
@@ -642,14 +683,15 @@ define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: or_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = or <64 x i32> %op1, %op2
@@ -658,31 +700,34 @@ define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v1i64:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = or <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v2i64:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = or <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: or_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = or <4 x i64> %op1, %op2
@@ -691,13 +736,28 @@ define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @or_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: or_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: or_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = or <8 x i64> %op1, %op2
@@ -705,14 +765,15 @@ define void @or_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: or_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = or <16 x i64> %op1, %op2
@@ -720,14 +781,15 @@ define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @or_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @or_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: or_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = or <32 x i64> %op1, %op2
@@ -740,31 +802,34 @@ define void @or_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v8i8:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = xor <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v16i8:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = xor <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = xor <32 x i8> %op1, %op2
@@ -773,13 +838,28 @@ define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @xor_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: xor_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: xor_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = xor <64 x i8> %op1, %op2
@@ -787,14 +867,15 @@ define void @xor_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @xor_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @xor_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: xor_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = xor <128 x i8> %op1, %op2
@@ -802,14 +883,15 @@ define void @xor_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @xor_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @xor_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: xor_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = xor <256 x i8> %op1, %op2
@@ -818,31 +900,34 @@ define void @xor_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v4i16:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = xor <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v8i16:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = xor <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = xor <16 x i16> %op1, %op2
@@ -851,13 +936,28 @@ define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @xor_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: xor_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: xor_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = xor <32 x i16> %op1, %op2
@@ -865,14 +965,15 @@ define void @xor_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @xor_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @xor_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: xor_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = xor <64 x i16> %op1, %op2
@@ -880,14 +981,15 @@ define void @xor_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @xor_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @xor_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: xor_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = xor <128 x i16> %op1, %op2
@@ -896,31 +998,34 @@ define void @xor_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v2i32:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = xor <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v4i32:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = xor <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = xor <8 x i32> %op1, %op2
@@ -929,13 +1034,28 @@ define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @xor_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: xor_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: xor_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = xor <16 x i32> %op1, %op2
@@ -943,14 +1063,15 @@ define void @xor_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @xor_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @xor_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: xor_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = xor <32 x i32> %op1, %op2
@@ -958,14 +1079,15 @@ define void @xor_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @xor_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @xor_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: xor_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = xor <64 x i32> %op1, %op2
@@ -974,31 +1096,34 @@ define void @xor_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v1i64:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = xor <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v2i64:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = xor <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: xor_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = xor <4 x i64> %op1, %op2
@@ -1007,13 +1132,28 @@ define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @xor_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: xor_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: xor_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = xor <8 x i64> %op1, %op2
@@ -1021,14 +1161,15 @@ define void @xor_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @xor_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @xor_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: xor_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = xor <16 x i64> %op1, %op2
@@ -1036,14 +1177,15 @@ define void @xor_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @xor_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @xor_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: xor_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = xor <32 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
index 065c278299b7..ca8bf9438200 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
@@ -1,55 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; SMAX
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i8:
-; CHECK: smax v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v16i8:
-; CHECK: smax v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
ret <16 x i8> %res
}
-define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -58,26 +46,28 @@ define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @smax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: smax_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smax z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: smax z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: smax_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -85,14 +75,15 @@ define void @smax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = call <128 x i8> @llvm.smax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -100,14 +91,15 @@ define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = call <256 x i8> @llvm.smax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -116,31 +108,34 @@ define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i16:
-; CHECK: smax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i16:
-; CHECK: smax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
ret <8 x i16> %res
}
-define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -149,26 +144,28 @@ define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @smax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: smax_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smax z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: smax z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smax_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -176,14 +173,15 @@ define void @smax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = call <64 x i16> @llvm.smax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -191,14 +189,15 @@ define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = call <128 x i16> @llvm.smax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -207,31 +206,34 @@ define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v2i32:
-; CHECK: smax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i32:
-; CHECK: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
ret <4 x i32> %res
}
-define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -240,26 +242,28 @@ define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @smax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: smax_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smax z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: smax z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smax_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -267,14 +271,15 @@ define void @smax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = call <32 x i32> @llvm.smax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -282,14 +287,15 @@ define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = call <64 x i32> @llvm.smax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -298,33 +304,42 @@ define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 max are not legal for NEON so use SVE when available.
-define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: smax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
ret <1 x i64> %res
}
; Vector i64 max are not legal for NEON so use SVE when available.
-define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: smax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
ret <2 x i64> %res
}
-define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -333,26 +348,28 @@ define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @smax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: smax_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smax z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: smax z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smax_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -360,14 +377,15 @@ define void @smax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = call <16 x i64> @llvm.smax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -375,14 +393,15 @@ define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = call <32 x i64> @llvm.smax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -395,31 +414,34 @@ define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i8:
-; CHECK: smin v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v16i8:
-; CHECK: smin v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
ret <16 x i8> %res
}
-define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -428,25 +450,28 @@ define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @smin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: smin_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smin z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: smin z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
+; VBITS_GE_512-LABEL: smin_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -454,14 +479,15 @@ define void @smin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = call <128 x i8> @llvm.smin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -469,14 +495,15 @@ define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = call <256 x i8> @llvm.smin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -485,31 +512,34 @@ define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i16:
-; CHECK: smin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i16:
-; CHECK: smin v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
ret <8 x i16> %res
}
-define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -518,26 +548,28 @@ define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @smin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: smin_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smin z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: smin z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smin_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -545,14 +577,15 @@ define void @smin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = call <64 x i16> @llvm.smin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -560,14 +593,15 @@ define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = call <128 x i16> @llvm.smin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -576,31 +610,34 @@ define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v2i32:
-; CHECK: smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i32:
-; CHECK: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
ret <4 x i32> %res
}
-define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -609,26 +646,28 @@ define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @smin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: smin_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smin z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: smin z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smin_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -636,14 +675,15 @@ define void @smin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -651,14 +691,15 @@ define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = call <64 x i32> @llvm.smin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -667,33 +708,42 @@ define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 min are not legal for NEON so use SVE when available.
-define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: smin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
ret <1 x i64> %res
}
; Vector i64 min are not legal for NEON so use SVE when available.
-define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: smin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
ret <2 x i64> %res
}
-define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -702,26 +752,28 @@ define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @smin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: smin_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: smin z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: smin z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smin_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -729,14 +781,15 @@ define void @smin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -744,14 +797,15 @@ define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = call <32 x i64> @llvm.smin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -764,31 +818,34 @@ define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i8:
-; CHECK: umax v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v16i8:
-; CHECK: umax v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
ret <16 x i8> %res
}
-define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -797,26 +854,28 @@ define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @umax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: umax_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umax z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: umax z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: umax_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -824,14 +883,15 @@ define void @umax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = call <128 x i8> @llvm.umax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -839,14 +899,15 @@ define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = call <256 x i8> @llvm.umax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -855,31 +916,34 @@ define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i16:
-; CHECK: umax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i16:
-; CHECK: umax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
ret <8 x i16> %res
}
-define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -888,26 +952,28 @@ define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @umax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: umax_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umax z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: umax z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umax_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -915,14 +981,15 @@ define void @umax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = call <64 x i16> @llvm.umax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -930,14 +997,15 @@ define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = call <128 x i16> @llvm.umax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -946,31 +1014,34 @@ define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v2i32:
-; CHECK: umax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i32:
-; CHECK: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
ret <4 x i32> %res
}
-define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -979,26 +1050,28 @@ define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @umax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: umax_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umax z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: umax z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umax_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -1006,14 +1079,15 @@ define void @umax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = call <32 x i32> @llvm.umax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -1021,14 +1095,15 @@ define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = call <64 x i32> @llvm.umax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -1037,33 +1112,42 @@ define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 max are not legal for NEON so use SVE when available.
-define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
ret <1 x i64> %res
}
; Vector i64 max are not legal for NEON so use SVE when available.
-define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
ret <2 x i64> %res
}
-define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -1072,26 +1156,28 @@ define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @umax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: umax_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umax z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: umax z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umax_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -1099,14 +1185,15 @@ define void @umax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = call <16 x i64> @llvm.umax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -1114,14 +1201,15 @@ define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = call <32 x i64> @llvm.umax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -1134,31 +1222,34 @@ define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i8:
-; CHECK: umin v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v16i8:
-; CHECK: umin v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
ret <16 x i8> %res
}
-define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -1167,25 +1258,28 @@ define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @umin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: umin_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umin z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: umin z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
+; VBITS_GE_512-LABEL: umin_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -1193,14 +1287,15 @@ define void @umin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = call <128 x i8> @llvm.umin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -1208,14 +1303,15 @@ define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = call <256 x i8> @llvm.umin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -1224,31 +1320,34 @@ define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i16:
-; CHECK: umin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i16:
-; CHECK: umin v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
ret <8 x i16> %res
}
-define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -1257,26 +1356,28 @@ define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @umin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: umin_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umin z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: umin z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umin_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -1284,14 +1385,15 @@ define void @umin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = call <64 x i16> @llvm.umin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -1299,14 +1401,15 @@ define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = call <128 x i16> @llvm.umin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -1315,31 +1418,34 @@ define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v2i32:
-; CHECK: umin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i32:
-; CHECK: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
ret <4 x i32> %res
}
-define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -1348,26 +1454,28 @@ define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @umin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: umin_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umin z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: umin z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umin_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -1375,14 +1483,15 @@ define void @umin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = call <32 x i32> @llvm.umin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -1390,14 +1499,15 @@ define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = call <64 x i32> @llvm.umin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -1406,33 +1516,42 @@ define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 min are not legal for NEON so use SVE when available.
-define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
ret <1 x i64> %res
}
; Vector i64 min are not legal for NEON so use SVE when available.
-define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
ret <2 x i64> %res
}
-define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -1441,26 +1560,28 @@ define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @umin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: umin_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umin z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: umin z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umin_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -1468,14 +1589,15 @@ define void @umin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = call <16 x i64> @llvm.umin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -1483,14 +1605,15 @@ define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @umin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @umin_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = call <32 x i64> @llvm.umin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -1599,4 +1722,3 @@ declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.umax.v32i64(<32 x i64>, <32 x i64>)
-
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 32dc75e19d7f..b050a4dcfcdb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,25 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefixes=CHECK,VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.
@@ -32,35 +14,29 @@ target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
-define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v8i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v1.8b
-; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v8i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: smull v0.8h, v0.8b, v1.8b
-; VBITS_GE_256-NEXT: ushr v1.8h, v0.8h, #8
-; VBITS_GE_256-NEXT: umov w8, v1.h[0]
-; VBITS_GE_256-NEXT: umov w9, v1.h[1]
-; VBITS_GE_256-NEXT: fmov s0, w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[2]
-; VBITS_GE_256-NEXT: mov v0.b[1], w9
-; VBITS_GE_256-NEXT: mov v0.b[2], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[3]
-; VBITS_GE_256-NEXT: mov v0.b[3], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[4]
-; VBITS_GE_256-NEXT: mov v0.b[4], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[5]
-; VBITS_GE_256-NEXT: mov v0.b[5], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[6]
-; VBITS_GE_256-NEXT: mov v0.b[6], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[7]
-; VBITS_GE_256-NEXT: mov v0.b[7], w8
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT: ret
+define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ushr v1.8h, v0.8h, #8
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: mov v0.b[1], w9
+; CHECK-NEXT: mov v0.b[2], w8
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: mov v0.b[3], w8
+; CHECK-NEXT: umov w8, v1.h[4]
+; CHECK-NEXT: mov v0.b[4], w8
+; CHECK-NEXT: umov w8, v1.h[5]
+; CHECK-NEXT: mov v0.b[5], w8
+; CHECK-NEXT: umov w8, v1.h[6]
+; CHECK-NEXT: mov v0.b[6], w8
+; CHECK-NEXT: umov w8, v1.h[7]
+; CHECK-NEXT: mov v0.b[7], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%insert = insertelement <8 x i16> undef, i16 8, i64 0
%splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
%1 = sext <8 x i8> %op1 to <8 x i16>
@@ -72,7 +48,7 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
@@ -87,30 +63,15 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
ret <16 x i8> %res
}
-define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: smull v4.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT: smull2 v0.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT: smull v5.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT: smull2 v1.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: shrn v3.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v0.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v1.8h, #8
-; VBITS_EQ_128-NEXT: stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v32i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%1 = sext <32 x i8> %op1 to <32 x i16>
@@ -123,40 +84,56 @@ define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v64i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT: smull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT: smull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT: smull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT: smull2 v16.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT: smull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: smull2 v3.8h, v4.16b, v5.16b
-; VBITS_EQ_128-NEXT: smull v4.8h, v4.8b, v5.8b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v4.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: smulh_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ptrue p1.h, vl16
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sunpklo z4.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z5.h, z1.b
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: sunpklo z6.h, z2.b
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: sunpklo z7.h, z3.b
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z2.h, z2.b
+; VBITS_GE_256-NEXT: sunpklo z3.h, z3.b
+; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: movprfx z2, z5
+; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h
+; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h
+; VBITS_GE_256-NEXT: lsr z0.h, p1/m, z0.h, #8
+; VBITS_GE_256-NEXT: movprfx z3, z4
+; VBITS_GE_256-NEXT: lsr z3.h, p1/m, z3.h, #8
+; VBITS_GE_256-NEXT: lsr z1.h, p1/m, z1.h, #8
+; VBITS_GE_256-NEXT: lsr z2.h, p1/m, z2.h, #8
+; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p1.b, vl16
+; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b
+; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b
+; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: smulh_v64i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: smulh_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%insert = insertelement <64 x i16> undef, i16 8, i64 0
@@ -170,64 +147,15 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v128i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT: smull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: smull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT: smull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT: smull2 v17.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT: smull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: smull2 v19.8h, v4.16b, v16.16b
-; VBITS_EQ_128-NEXT: smull v4.8h, v4.8b, v16.8b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v4.16b, v19.8h, #8
-; VBITS_EQ_128-NEXT: smull2 v21.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT: smull v3.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT: ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT: smull2 v23.8h, v18.16b, v20.16b
-; VBITS_EQ_128-NEXT: smull v18.8h, v18.8b, v20.8b
-; VBITS_EQ_128-NEXT: shrn v3.8b, v3.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT: shrn v18.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn2 v18.16b, v23.8h, #8
-; VBITS_EQ_128-NEXT: smull v20.8h, v16.8b, v5.8b
-; VBITS_EQ_128-NEXT: smull2 v5.8h, v16.16b, v5.16b
-; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: smull v25.8h, v22.8b, v24.8b
-; VBITS_EQ_128-NEXT: smull2 v16.8h, v22.16b, v24.16b
-; VBITS_EQ_128-NEXT: shrn v20.8b, v20.8h, #8
-; VBITS_EQ_128-NEXT: shrn v22.8b, v25.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v20.16b, v5.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v22.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT: stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: smulh_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-
+define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%1 = sext <128 x i8> %op1 to <128 x i16>
@@ -239,130 +167,15 @@ define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v256i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: sub sp, sp, #96
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: smull2 v0.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT: smull v4.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT: smull2 v0.8h, v2.16b, v6.16b
-; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: smull v6.8h, v2.8b, v6.8b
-; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: smull2 v2.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT: shrn v6.8b, v6.8h, #8
-; VBITS_EQ_128-NEXT: smull v5.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT: smull2 v3.8h, v7.16b, v16.16b
-; VBITS_EQ_128-NEXT: smull v7.8h, v7.8b, v16.8b
-; VBITS_EQ_128-NEXT: shrn v5.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v5.16b, v2.8h, #8
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT: shrn v7.8b, v7.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v7.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT: smull2 v31.8h, v19.16b, v16.16b
-; VBITS_EQ_128-NEXT: smull v9.8h, v19.8b, v16.8b
-; VBITS_EQ_128-NEXT: smull2 v21.8h, v18.16b, v17.16b
-; VBITS_EQ_128-NEXT: smull v30.8h, v18.8b, v17.8b
-; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v9.8b, v9.8h, #8
-; VBITS_EQ_128-NEXT: shrn v30.8b, v30.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v9.16b, v31.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v30.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT: smull2 v16.8h, v17.16b, v20.16b
-; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: smull v18.8h, v17.8b, v20.8b
-; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT: smull2 v17.8h, v22.16b, v19.16b
-; VBITS_EQ_128-NEXT: shrn2 v4.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: smull v19.8h, v22.8b, v19.8b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT: shrn v3.8b, v19.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT: smull2 v12.8h, v24.16b, v22.16b
-; VBITS_EQ_128-NEXT: smull v13.8h, v24.8b, v22.8b
-; VBITS_EQ_128-NEXT: smull2 v10.8h, v20.16b, v23.16b
-; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: smull v11.8h, v20.8b, v23.8b
-; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT: shrn2 v6.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT: smull2 v22.8h, v26.16b, v24.16b
-; VBITS_EQ_128-NEXT: smull v24.8h, v26.8b, v24.8b
-; VBITS_EQ_128-NEXT: smull2 v20.8h, v23.16b, v25.16b
-; VBITS_EQ_128-NEXT: smull v23.8h, v23.8b, v25.8b
-; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT: smull2 v15.8h, v28.16b, v26.16b
-; VBITS_EQ_128-NEXT: smull v1.8h, v28.8b, v26.8b
-; VBITS_EQ_128-NEXT: smull2 v14.8h, v25.16b, v27.16b
-; VBITS_EQ_128-NEXT: smull v8.8h, v25.8b, v27.8b
-; VBITS_EQ_128-NEXT: ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT: shrn v8.8b, v8.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v8.16b, v14.8h, #8
-; VBITS_EQ_128-NEXT: ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v2.8b, v23.8h, #8
-; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT: shrn v3.8b, v24.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v20.8h, #8
-; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: smull2 v26.8h, v0.16b, v28.16b
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v22.8h, #8
-; VBITS_EQ_128-NEXT: smull v28.8h, v0.8b, v28.8b
-; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT: smull2 v25.8h, v27.16b, v29.16b
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT: smull v27.8h, v27.8b, v29.8b
-; VBITS_EQ_128-NEXT: shrn v29.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT: shrn v0.8b, v13.8h, #8
-; VBITS_EQ_128-NEXT: shrn v1.8b, v11.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v29.16b, v15.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v12.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v10.8h, #8
-; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn v0.8b, v27.8h, #8
-; VBITS_EQ_128-NEXT: shrn v1.8b, v28.8h, #8
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v25.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v26.8h, #8
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add sp, sp, #96
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: smulh_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%1 = sext <256 x i8> %op1 to <256 x i16>
@@ -376,26 +189,20 @@ define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
-define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v4i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v1.4h
-; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v4i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: smull v0.4s, v0.4h, v1.4h
-; VBITS_GE_256-NEXT: ushr v1.4s, v0.4s, #16
-; VBITS_GE_256-NEXT: mov w8, v1.s[1]
-; VBITS_GE_256-NEXT: mov w9, v1.s[2]
-; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
-; VBITS_GE_256-NEXT: mov v0.h[1], w8
-; VBITS_GE_256-NEXT: mov w8, v1.s[3]
-; VBITS_GE_256-NEXT: mov v0.h[2], w9
-; VBITS_GE_256-NEXT: mov v0.h[3], w8
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT: ret
+define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: ushr v1.4s, v0.4s, #16
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%1 = sext <4 x i16> %op1 to <4 x i32>
%2 = sext <4 x i16> %op2 to <4 x i32>
%mul = mul <4 x i32> %1, %2
@@ -405,7 +212,7 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
@@ -420,30 +227,15 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
ret <8 x i16> %res
}
-define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v16i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: smull v4.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT: smull2 v0.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT: smull v5.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT: smull2 v1.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: shrn v3.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v0.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v1.4s, #16
-; VBITS_EQ_128-NEXT: stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v16i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%1 = sext <16 x i16> %op1 to <16 x i32>
@@ -456,40 +248,47 @@ define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT: smull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT: smull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT: smull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT: smull2 v16.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT: smull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: smull2 v3.4s, v4.8h, v5.8h
-; VBITS_EQ_128-NEXT: smull v4.4s, v4.4h, v5.4h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v4.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: smulh_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z7.d, z1.d
+; VBITS_GE_256-NEXT: mov z16.d, z3.d
+; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT: smull2 v4.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT: smull v5.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: smull2 v6.4s, v1.8h, v3.8h
+; VBITS_GE_256-NEXT: smull v1.4s, v1.4h, v3.4h
+; VBITS_GE_256-NEXT: smull2 v3.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT: smull v0.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT: smull2 v2.4s, v7.8h, v16.8h
+; VBITS_GE_256-NEXT: smull v7.4s, v7.4h, v16.4h
+; VBITS_GE_256-NEXT: uzp2 v4.8h, v5.8h, v4.8h
+; VBITS_GE_256-NEXT: uzp2 v1.8h, v1.8h, v6.8h
+; VBITS_GE_256-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; VBITS_GE_256-NEXT: uzp2 v2.8h, v7.8h, v2.8h
+; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h
+; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: smulh_v32i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: smulh_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%1 = sext <32 x i16> %op1 to <32 x i32>
@@ -501,63 +300,15 @@ define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v64i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT: smull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: smull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT: smull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT: smull2 v17.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT: smull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: smull2 v19.4s, v4.8h, v16.8h
-; VBITS_EQ_128-NEXT: smull v4.4s, v4.4h, v16.4h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v4.8h, v19.4s, #16
-; VBITS_EQ_128-NEXT: smull2 v21.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT: smull v3.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT: ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT: smull2 v23.4s, v18.8h, v20.8h
-; VBITS_EQ_128-NEXT: smull v18.4s, v18.4h, v20.4h
-; VBITS_EQ_128-NEXT: shrn v3.4h, v3.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT: shrn v18.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn2 v18.8h, v23.4s, #16
-; VBITS_EQ_128-NEXT: smull v20.4s, v16.4h, v5.4h
-; VBITS_EQ_128-NEXT: smull2 v5.4s, v16.8h, v5.8h
-; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: smull v25.4s, v22.4h, v24.4h
-; VBITS_EQ_128-NEXT: smull2 v16.4s, v22.8h, v24.8h
-; VBITS_EQ_128-NEXT: shrn v20.4h, v20.4s, #16
-; VBITS_EQ_128-NEXT: shrn v22.4h, v25.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v20.8h, v5.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v22.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT: stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: smulh_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%1 = sext <64 x i16> %op1 to <64 x i32>
@@ -569,130 +320,15 @@ define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v128i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: sub sp, sp, #96
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: smull2 v0.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT: smull v4.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT: smull2 v0.4s, v2.8h, v6.8h
-; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: smull v6.4s, v2.4h, v6.4h
-; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: smull2 v2.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT: shrn v6.4h, v6.4s, #16
-; VBITS_EQ_128-NEXT: smull v5.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT: smull2 v3.4s, v7.8h, v16.8h
-; VBITS_EQ_128-NEXT: smull v7.4s, v7.4h, v16.4h
-; VBITS_EQ_128-NEXT: shrn v5.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v5.8h, v2.4s, #16
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT: shrn v7.4h, v7.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v7.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT: smull2 v31.4s, v19.8h, v16.8h
-; VBITS_EQ_128-NEXT: smull v9.4s, v19.4h, v16.4h
-; VBITS_EQ_128-NEXT: smull2 v21.4s, v18.8h, v17.8h
-; VBITS_EQ_128-NEXT: smull v30.4s, v18.4h, v17.4h
-; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v9.4h, v9.4s, #16
-; VBITS_EQ_128-NEXT: shrn v30.4h, v30.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v9.8h, v31.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v30.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT: smull2 v16.4s, v17.8h, v20.8h
-; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: smull v18.4s, v17.4h, v20.4h
-; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT: smull2 v17.4s, v22.8h, v19.8h
-; VBITS_EQ_128-NEXT: shrn2 v4.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: smull v19.4s, v22.4h, v19.4h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT: shrn v3.4h, v19.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT: smull2 v12.4s, v24.8h, v22.8h
-; VBITS_EQ_128-NEXT: smull v13.4s, v24.4h, v22.4h
-; VBITS_EQ_128-NEXT: smull2 v10.4s, v20.8h, v23.8h
-; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: smull v11.4s, v20.4h, v23.4h
-; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT: shrn2 v6.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT: smull2 v22.4s, v26.8h, v24.8h
-; VBITS_EQ_128-NEXT: smull v24.4s, v26.4h, v24.4h
-; VBITS_EQ_128-NEXT: smull2 v20.4s, v23.8h, v25.8h
-; VBITS_EQ_128-NEXT: smull v23.4s, v23.4h, v25.4h
-; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT: smull2 v15.4s, v28.8h, v26.8h
-; VBITS_EQ_128-NEXT: smull v1.4s, v28.4h, v26.4h
-; VBITS_EQ_128-NEXT: smull2 v14.4s, v25.8h, v27.8h
-; VBITS_EQ_128-NEXT: smull v8.4s, v25.4h, v27.4h
-; VBITS_EQ_128-NEXT: ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT: shrn v8.4h, v8.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v8.8h, v14.4s, #16
-; VBITS_EQ_128-NEXT: ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v2.4h, v23.4s, #16
-; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT: shrn v3.4h, v24.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v20.4s, #16
-; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: smull2 v26.4s, v0.8h, v28.8h
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v22.4s, #16
-; VBITS_EQ_128-NEXT: smull v28.4s, v0.4h, v28.4h
-; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT: smull2 v25.4s, v27.8h, v29.8h
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT: smull v27.4s, v27.4h, v29.4h
-; VBITS_EQ_128-NEXT: shrn v29.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT: shrn v0.4h, v13.4s, #16
-; VBITS_EQ_128-NEXT: shrn v1.4h, v11.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v29.8h, v15.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v12.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v10.4s, #16
-; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn v0.4h, v27.4s, #16
-; VBITS_EQ_128-NEXT: shrn v1.4h, v28.4s, #16
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v25.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v26.4s, #16
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add sp, sp, #96
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: smulh_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%1 = sext <128 x i16> %op1 to <128 x i32>
@@ -705,7 +341,7 @@ define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
@@ -714,8 +350,6 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
-
-
%1 = sext <2 x i32> %op1 to <2 x i64>
%2 = sext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
@@ -725,7 +359,7 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
@@ -740,39 +374,15 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
ret <4 x i32> %res
}
-define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v8i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: sshll v5.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: sshll v4.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: sshll v7.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: sshll v6.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z7.d
-; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z6.d
-; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT: shrn v2.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_128-NEXT: shrn2 v5.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v2.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT: stp q5, q2, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v8i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%1 = sext <8 x i32> %op1 to <8 x i64>
@@ -785,57 +395,47 @@ define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v16i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q1, q2, [x0, #32]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: sshll v19.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT: sshll v18.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: sshll v7.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #32]
-; VBITS_EQ_128-NEXT: sshll v0.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v4.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v21.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: sshll v5.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT: sshll2 v22.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z19.d
-; VBITS_EQ_128-NEXT: sshll v6.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z21.d
-; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z22.d
-; VBITS_EQ_128-NEXT: sshll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z18.d
-; VBITS_EQ_128-NEXT: sshll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: sshll v20.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z19.d
-; VBITS_EQ_128-NEXT: sshll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z20.d
-; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: shrn v7.2s, v7.2d, #32
-; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
-; VBITS_EQ_128-NEXT: shrn2 v5.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v2.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v7.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT: stp q5, q6, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q7, q0, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: smulh_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z7.d, z1.d
+; VBITS_GE_256-NEXT: mov z16.d, z3.d
+; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT: smull2 v4.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT: smull v5.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: smull2 v6.2d, v1.4s, v3.4s
+; VBITS_GE_256-NEXT: smull v1.2d, v1.2s, v3.2s
+; VBITS_GE_256-NEXT: smull2 v3.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT: smull v0.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT: smull2 v2.2d, v7.4s, v16.4s
+; VBITS_GE_256-NEXT: smull v7.2d, v7.2s, v16.2s
+; VBITS_GE_256-NEXT: uzp2 v4.4s, v5.4s, v4.4s
+; VBITS_GE_256-NEXT: uzp2 v1.4s, v1.4s, v6.4s
+; VBITS_GE_256-NEXT: uzp2 v0.4s, v0.4s, v3.4s
+; VBITS_GE_256-NEXT: uzp2 v2.4s, v7.4s, v2.4s
+; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: smulh_v16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: smulh_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%1 = sext <16 x i32> %op1 to <16 x i64>
@@ -847,104 +447,15 @@ define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -32
-; VBITS_EQ_128-NEXT: ldp q17, q16, [x0, #64]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: sshll v27.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v29.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: ldp q23, q28, [x0, #96]
-; VBITS_EQ_128-NEXT: sshll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v22.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: sshll v31.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v8.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT: ldp q26, q25, [x1, #96]
-; VBITS_EQ_128-NEXT: sshll v30.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v28.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v9.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT: sshll v26.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT: ldp q24, q21, [x1, #64]
-; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z31.d
-; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z9.d
-; VBITS_EQ_128-NEXT: sshll2 v10.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT: sshll v25.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v31.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z10.d
-; VBITS_EQ_128-NEXT: sshll v24.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z30.d
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z27.d
-; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z31.d
-; VBITS_EQ_128-NEXT: sshll2 v30.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT: sshll v21.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT: sshll v6.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z30.d
-; VBITS_EQ_128-NEXT: mul z19.d, p0/m, z19.d, z21.d
-; VBITS_EQ_128-NEXT: ldp q20, q18, [x1, #32]
-; VBITS_EQ_128-NEXT: sshll v4.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT: shrn v19.2s, v19.2d, #32
-; VBITS_EQ_128-NEXT: sshll2 v5.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v7.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v27.2d, v20.4s, #0
-; VBITS_EQ_128-NEXT: sshll v20.2d, v20.2s, #0
-; VBITS_EQ_128-NEXT: ldp q3, q1, [x0]
-; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z20.d
-; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z27.d
-; VBITS_EQ_128-NEXT: sshll2 v21.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: sshll v18.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT: sshll v2.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z21.d
-; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z18.d
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT: sshll v0.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: shrn v4.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: shrn v18.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT: sshll v20.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT: sshll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: sshll v23.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z20.d
-; VBITS_EQ_128-NEXT: sshll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT: shrn v16.2s, v26.2d, #32
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z23.d
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT: shrn v2.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT: shrn v17.2s, v25.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v16.4s, v8.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v18.4s, v29.2d, #32
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: shrn2 v17.4s, v28.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v4.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v2.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT: stp q18, q19, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q2, q0, [x0]
-; VBITS_EQ_128-NEXT: stp q16, q17, [x0, #96]
-; VBITS_EQ_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: smulh_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%1 = sext <32 x i32> %op1 to <32 x i64>
@@ -956,276 +467,15 @@ define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v64i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 80
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset w29, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -24
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -32
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -40
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -48
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -56
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -64
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -72
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -80
-; VBITS_EQ_128-NEXT: addvl sp, sp, #-12
-; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG
-; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG
-; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #96]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: stp q5, q4, [sp, #-80]! // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q0, q2, [x0, #48]
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ldr q1, [x0, #32]
-; VBITS_EQ_128-NEXT: ldr q3, [x0, #80]
-; VBITS_EQ_128-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: sshll v1.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: stp q3, q2, [sp, #32] // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: str z1, [x8, #11, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: str z0, [x8, #10, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #9, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: ldp q23, q26, [x0, #128]
-; VBITS_EQ_128-NEXT: str z0, [x8, #8, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #7, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: ldp q25, q24, [x0, #160]
-; VBITS_EQ_128-NEXT: str z0, [x8, #6, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v1.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #5, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v27.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT: ldp q30, q0, [x0, #192]
-; VBITS_EQ_128-NEXT: str z1, [x8, #4, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v9.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v12.2d, v30.4s, #0
-; VBITS_EQ_128-NEXT: ldp q31, q1, [x0, #224]
-; VBITS_EQ_128-NEXT: sshll v11.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v8.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: sshll v10.2d, v31.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v15.2d, v31.4s, #0
-; VBITS_EQ_128-NEXT: ldp q29, q28, [x1, #224]
-; VBITS_EQ_128-NEXT: sshll2 v18.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: sshll v31.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v2.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT: ldp q14, q0, [x1, #192]
-; VBITS_EQ_128-NEXT: sshll v1.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT: sshll v20.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v19.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT: mul z11.d, p0/m, z11.d, z20.d
-; VBITS_EQ_128-NEXT: ldp q21, q22, [x0]
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z18.d
-; VBITS_EQ_128-NEXT: sshll v18.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT: sshll v20.2d, v14.2s, #0
-; VBITS_EQ_128-NEXT: ldp q4, q13, [x1, #160]
-; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #128]
-; VBITS_EQ_128-NEXT: ldp q7, q3, [x1, #96]
-; VBITS_EQ_128-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ldp q17, q16, [x1, #64]
-; VBITS_EQ_128-NEXT: movprfx z0, z31
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z0, z15
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT: sshll v1.2d, v30.2s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ldp q2, q29, [x1, #32]
-; VBITS_EQ_128-NEXT: movprfx z15, z10
-; VBITS_EQ_128-NEXT: mul z15.d, p0/m, z15.d, z18.d
-; VBITS_EQ_128-NEXT: movprfx z0, z8
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z19.d
-; VBITS_EQ_128-NEXT: str z0, [x8] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v14.4s, #0
-; VBITS_EQ_128-NEXT: ldp q19, q18, [x1]
-; VBITS_EQ_128-NEXT: movprfx z10, z12
-; VBITS_EQ_128-NEXT: mul z10.d, p0/m, z10.d, z0.d
-; VBITS_EQ_128-NEXT: movprfx z8, z1
-; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z20.d
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v13.4s, #0
-; VBITS_EQ_128-NEXT: sshll v12.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT: sshll v1.2d, v13.2s, #0
-; VBITS_EQ_128-NEXT: mul z9.d, p0/m, z9.d, z0.d
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT: mul z12.d, p0/m, z12.d, z1.d
-; VBITS_EQ_128-NEXT: sshll v1.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT: mul z27.d, p0/m, z27.d, z0.d
-; VBITS_EQ_128-NEXT: sshll v20.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z13, z20
-; VBITS_EQ_128-NEXT: mul z13.d, p0/m, z13.d, z1.d
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT: sshll v1.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT: ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z14, z6
-; VBITS_EQ_128-NEXT: mul z14.d, p0/m, z14.d, z0.d
-; VBITS_EQ_128-NEXT: sshll v4.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z30, z4
-; VBITS_EQ_128-NEXT: mul z30.d, p0/m, z30.d, z1.d
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll v1.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z31, z4
-; VBITS_EQ_128-NEXT: mul z31.d, p0/m, z31.d, z0.d
-; VBITS_EQ_128-NEXT: sshll v6.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT: ldr q4, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: movprfx z28, z6
-; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z1.d
-; VBITS_EQ_128-NEXT: sshll v1.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z23, z3
-; VBITS_EQ_128-NEXT: mul z23.d, p0/m, z23.d, z0.d
-; VBITS_EQ_128-NEXT: sshll v5.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: movprfx z20, z5
-; VBITS_EQ_128-NEXT: mul z20.d, p0/m, z20.d, z1.d
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT: sshll v4.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z7, z1
-; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z0.d
-; VBITS_EQ_128-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: sshll v3.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z6, z3
-; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z4.d
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: sshll v5.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: movprfx z26, z1
-; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z0.d
-; VBITS_EQ_128-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll v3.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z24, z5
-; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z3.d
-; VBITS_EQ_128-NEXT: sshll v16.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: movprfx z25, z1
-; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z0.d
-; VBITS_EQ_128-NEXT: sshll v5.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT: sshll v17.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z29, z16
-; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z5.d
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z4, z1
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z0.d
-; VBITS_EQ_128-NEXT: sshll v5.2d, v22.2s, #0
-; VBITS_EQ_128-NEXT: ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: sshll2 v16.2d, v22.4s, #0
-; VBITS_EQ_128-NEXT: movprfx z22, z0
-; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z17.d
-; VBITS_EQ_128-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: sshll v1.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: sshll v17.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: sshll v3.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT: sshll2 v18.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT: movprfx z2, z5
-; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z3.d
-; VBITS_EQ_128-NEXT: mul z18.d, p0/m, z18.d, z16.d
-; VBITS_EQ_128-NEXT: sshll2 v5.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT: sshll2 v16.2d, v19.4s, #0
-; VBITS_EQ_128-NEXT: sshll v17.2d, v19.2s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
-; VBITS_EQ_128-NEXT: shrn v16.2s, v1.2d, #32
-; VBITS_EQ_128-NEXT: sshll v3.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT: shrn v21.2s, v22.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v16.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT: shrn v0.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: shrn v1.2s, v20.2d, #32
-; VBITS_EQ_128-NEXT: mul z17.d, p0/m, z17.d, z3.d
-; VBITS_EQ_128-NEXT: shrn2 v21.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT: shrn v3.2s, v13.2d, #32
-; VBITS_EQ_128-NEXT: ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: shrn v4.2s, v12.2d, #32
-; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: shrn v7.2s, v15.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v1.4s, v23.2d, #32
-; VBITS_EQ_128-NEXT: ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: shrn2 v3.4s, v27.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v4.4s, v9.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v19.2d, #32
-; VBITS_EQ_128-NEXT: shrn v19.2s, v11.2d, #32
-; VBITS_EQ_128-NEXT: ldr z22, [x8] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q16, q21, [x0, #32]
-; VBITS_EQ_128-NEXT: shrn2 v7.4s, v20.2d, #32
-; VBITS_EQ_128-NEXT: shrn v20.2s, v8.2d, #32
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn v0.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT: stp q3, q4, [x0, #160]
-; VBITS_EQ_128-NEXT: shrn v3.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #224]
-; VBITS_EQ_128-NEXT: shrn v6.2s, v30.2d, #32
-; VBITS_EQ_128-NEXT: shrn v7.2s, v28.2d, #32
-; VBITS_EQ_128-NEXT: shrn v4.2s, v29.2d, #32
-; VBITS_EQ_128-NEXT: shrn v1.2s, v17.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v20.4s, v10.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v14.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v7.4s, v31.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v3.4s, v26.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v4.4s, v25.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v18.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v1.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #128]
-; VBITS_EQ_128-NEXT: stp q4, q3, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT: stp q20, q19, [x0, #192]
-; VBITS_EQ_128-NEXT: addvl sp, sp, #12
-; VBITS_EQ_128-NEXT: add sp, sp, #80
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: smulh_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%1 = sext <64 x i32> %op1 to <64 x i64>
@@ -1238,25 +488,15 @@ define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v1i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: // kill: def $d1 killed $d1 def $q1
-; VBITS_EQ_128-NEXT: // kill: def $d0 killed $d0 def $q0
-; VBITS_EQ_128-NEXT: fmov x8, d0
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: smulh x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d0, x8
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v1i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.d, vl1
-; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
-; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT: ret
+define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%insert = insertelement <1 x i128> undef, i128 64, i128 0
%splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
%1 = sext <1 x i64> %op1 to <1 x i128>
@@ -1268,28 +508,15 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
}
; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v2i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: mov x8, v0.d[1]
-; VBITS_EQ_128-NEXT: fmov x10, d0
-; VBITS_EQ_128-NEXT: mov x9, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d1
-; VBITS_EQ_128-NEXT: smulh x10, x10, x11
-; VBITS_EQ_128-NEXT: smulh x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d0, x10
-; VBITS_EQ_128-NEXT: fmov d1, x8
-; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v2i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.d, vl2
-; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
-; VBITS_GE_256-NEXT: ret
+define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%1 = sext <2 x i64> %op1 to <2 x i128>
%2 = sext <2 x i64> %op2 to <2 x i128>
%mul = mul <2 x i128> %1, %2
@@ -1298,40 +525,15 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
ret <2 x i64> %res
}
-define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v4i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: mov x10, v0.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d0
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: mov x8, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mov x12, v2.d[1]
-; VBITS_EQ_128-NEXT: fmov x13, d2
-; VBITS_EQ_128-NEXT: mov x14, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x15, d3
-; VBITS_EQ_128-NEXT: smulh x11, x11, x13
-; VBITS_EQ_128-NEXT: smulh x10, x10, x12
-; VBITS_EQ_128-NEXT: smulh x9, x9, x15
-; VBITS_EQ_128-NEXT: smulh x8, x8, x14
-; VBITS_EQ_128-NEXT: fmov d0, x11
-; VBITS_EQ_128-NEXT: fmov d1, x10
-; VBITS_EQ_128-NEXT: fmov d2, x9
-; VBITS_EQ_128-NEXT: fmov d3, x8
-; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT: stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: smulh_v4i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%1 = sext <4 x i64> %op1 to <4 x i128>
@@ -1344,60 +546,69 @@ define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v8i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: fmov x14, d0
-; VBITS_EQ_128-NEXT: mov x13, v0.d[1]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov x12, d1
-; VBITS_EQ_128-NEXT: mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT: ldp q4, q5, [x1, #32]
-; VBITS_EQ_128-NEXT: mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x9, d3
-; VBITS_EQ_128-NEXT: fmov x17, d4
-; VBITS_EQ_128-NEXT: mov x15, v4.d[1]
-; VBITS_EQ_128-NEXT: ldp q3, q1, [x1]
-; VBITS_EQ_128-NEXT: fmov x1, d5
-; VBITS_EQ_128-NEXT: smulh x14, x14, x17
-; VBITS_EQ_128-NEXT: mov x18, v5.d[1]
-; VBITS_EQ_128-NEXT: smulh x13, x13, x15
-; VBITS_EQ_128-NEXT: fmov x15, d2
-; VBITS_EQ_128-NEXT: smulh x12, x12, x1
-; VBITS_EQ_128-NEXT: mov x1, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x17, d1
-; VBITS_EQ_128-NEXT: smulh x11, x11, x18
-; VBITS_EQ_128-NEXT: mov x16, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov d2, x13
-; VBITS_EQ_128-NEXT: fmov d5, x12
-; VBITS_EQ_128-NEXT: smulh x9, x9, x17
-; VBITS_EQ_128-NEXT: fmov x17, d3
-; VBITS_EQ_128-NEXT: smulh x10, x10, x1
-; VBITS_EQ_128-NEXT: fmov d3, x14
-; VBITS_EQ_128-NEXT: smulh x8, x8, x16
-; VBITS_EQ_128-NEXT: fmov d4, x11
-; VBITS_EQ_128-NEXT: smulh x15, x15, x17
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: fmov d6, x10
-; VBITS_EQ_128-NEXT: fmov d0, x8
-; VBITS_EQ_128-NEXT: fmov d7, x15
-; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT: stp q3, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q7, q1, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: smulh_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ptrue p1.d, vl2
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov x9, v0.d[1]
+; VBITS_GE_256-NEXT: fmov x10, d0
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: fmov x17, d2
+; VBITS_GE_256-NEXT: mov x13, v2.d[1]
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: mov x14, v0.d[1]
+; VBITS_GE_256-NEXT: mov x18, v2.d[1]
+; VBITS_GE_256-NEXT: smulh x10, x10, x17
+; VBITS_GE_256-NEXT: mov x11, v1.d[1]
+; VBITS_GE_256-NEXT: fmov x12, d1
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: mov x2, v3.d[1]
+; VBITS_GE_256-NEXT: fmov x3, d3
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT: smulh x9, x9, x13
+; VBITS_GE_256-NEXT: mov x13, v1.d[1]
+; VBITS_GE_256-NEXT: smulh x14, x14, x18
+; VBITS_GE_256-NEXT: mov x18, v3.d[1]
+; VBITS_GE_256-NEXT: smulh x12, x12, x3
+; VBITS_GE_256-NEXT: fmov x15, d0
+; VBITS_GE_256-NEXT: fmov x16, d1
+; VBITS_GE_256-NEXT: fmov x1, d2
+; VBITS_GE_256-NEXT: fmov x17, d3
+; VBITS_GE_256-NEXT: fmov d0, x9
+; VBITS_GE_256-NEXT: fmov d1, x10
+; VBITS_GE_256-NEXT: smulh x9, x11, x2
+; VBITS_GE_256-NEXT: smulh x15, x15, x1
+; VBITS_GE_256-NEXT: fmov d4, x12
+; VBITS_GE_256-NEXT: smulh x16, x16, x17
+; VBITS_GE_256-NEXT: smulh x10, x13, x18
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: fmov d0, x14
+; VBITS_GE_256-NEXT: fmov d2, x15
+; VBITS_GE_256-NEXT: fmov d3, x9
+; VBITS_GE_256-NEXT: fmov d6, x16
+; VBITS_GE_256-NEXT: fmov d5, x10
+; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: mov v4.d[1], v3.d[0]
+; VBITS_GE_256-NEXT: mov v6.d[1], v5.d[0]
+; VBITS_GE_256-NEXT: splice z1.d, p1, z1.d, z2.d
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: splice z4.d, p1, z4.d, z6.d
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: smulh_v8i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: smulh_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%1 = sext <8 x i64> %op1 to <8 x i128>
@@ -1409,111 +620,15 @@ define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v16i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: str x21, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT: .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT: .cfi_offset w21, -32
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d2
-; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x9, d3
-; VBITS_EQ_128-NEXT: mov x14, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x15, d4
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT: fmov x13, d5
-; VBITS_EQ_128-NEXT: fmov x5, d0
-; VBITS_EQ_128-NEXT: mov x4, v0.d[1]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x0, #64]
-; VBITS_EQ_128-NEXT: mov x3, v1.d[1]
-; VBITS_EQ_128-NEXT: mov x18, v2.d[1]
-; VBITS_EQ_128-NEXT: fmov x2, d2
-; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #96]
-; VBITS_EQ_128-NEXT: mov x16, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x17, d3
-; VBITS_EQ_128-NEXT: fmov x19, d5
-; VBITS_EQ_128-NEXT: mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT: ldp q4, q7, [x1, #64]
-; VBITS_EQ_128-NEXT: mov x20, v6.d[1]
-; VBITS_EQ_128-NEXT: fmov x21, d6
-; VBITS_EQ_128-NEXT: smulh x5, x5, x19
-; VBITS_EQ_128-NEXT: smulh x4, x4, x6
-; VBITS_EQ_128-NEXT: mov x19, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x6, d4
-; VBITS_EQ_128-NEXT: smulh x3, x3, x20
-; VBITS_EQ_128-NEXT: ldp q3, q16, [x1, #32]
-; VBITS_EQ_128-NEXT: fmov x20, d7
-; VBITS_EQ_128-NEXT: smulh x2, x2, x6
-; VBITS_EQ_128-NEXT: smulh x18, x18, x19
-; VBITS_EQ_128-NEXT: fmov d18, x4
-; VBITS_EQ_128-NEXT: fmov d19, x5
-; VBITS_EQ_128-NEXT: fmov d20, x3
-; VBITS_EQ_128-NEXT: smulh x17, x17, x20
-; VBITS_EQ_128-NEXT: fmov x19, d3
-; VBITS_EQ_128-NEXT: fmov d23, x2
-; VBITS_EQ_128-NEXT: ldp q2, q17, [x1]
-; VBITS_EQ_128-NEXT: fmov x1, d1
-; VBITS_EQ_128-NEXT: fmov x20, d16
-; VBITS_EQ_128-NEXT: smulh x15, x15, x19
-; VBITS_EQ_128-NEXT: fmov d22, x18
-; VBITS_EQ_128-NEXT: mov v19.d[1], v18.d[0]
-; VBITS_EQ_128-NEXT: smulh x1, x1, x21
-; VBITS_EQ_128-NEXT: mov x21, v7.d[1]
-; VBITS_EQ_128-NEXT: smulh x13, x13, x20
-; VBITS_EQ_128-NEXT: mov x7, v17.d[1]
-; VBITS_EQ_128-NEXT: mov x6, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x20, v16.d[1]
-; VBITS_EQ_128-NEXT: smulh x16, x16, x21
-; VBITS_EQ_128-NEXT: fmov x21, d2
-; VBITS_EQ_128-NEXT: fmov x19, d17
-; VBITS_EQ_128-NEXT: smulh x8, x8, x7
-; VBITS_EQ_128-NEXT: smulh x10, x10, x6
-; VBITS_EQ_128-NEXT: fmov d5, x13
-; VBITS_EQ_128-NEXT: smulh x11, x11, x21
-; VBITS_EQ_128-NEXT: fmov d7, x15
-; VBITS_EQ_128-NEXT: mov x21, v3.d[1]
-; VBITS_EQ_128-NEXT: smulh x9, x9, x19
-; VBITS_EQ_128-NEXT: smulh x12, x12, x20
-; VBITS_EQ_128-NEXT: fmov d0, x8
-; VBITS_EQ_128-NEXT: fmov d2, x10
-; VBITS_EQ_128-NEXT: fmov d16, x16
-; VBITS_EQ_128-NEXT: fmov d3, x11
-; VBITS_EQ_128-NEXT: fmov d17, x17
-; VBITS_EQ_128-NEXT: smulh x14, x14, x21
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: fmov d4, x12
-; VBITS_EQ_128-NEXT: fmov d21, x1
-; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT: mov v17.d[1], v16.d[0]
-; VBITS_EQ_128-NEXT: fmov d6, x14
-; VBITS_EQ_128-NEXT: mov v21.d[1], v20.d[0]
-; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT: stp q23, q17, [x0, #64]
-; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT: stp q19, q21, [x0, #96]
-; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q3, q1, [x0]
-; VBITS_EQ_128-NEXT: ldr x21, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: smulh_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%1 = sext <16 x i64> %op1 to <16 x i128>
@@ -1525,237 +640,15 @@ define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: sub sp, sp, #224
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 224
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT: .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT: .cfi_offset w21, -24
-; VBITS_EQ_128-NEXT: .cfi_offset w22, -32
-; VBITS_EQ_128-NEXT: .cfi_offset w23, -40
-; VBITS_EQ_128-NEXT: .cfi_offset w24, -48
-; VBITS_EQ_128-NEXT: .cfi_offset w25, -56
-; VBITS_EQ_128-NEXT: .cfi_offset w26, -64
-; VBITS_EQ_128-NEXT: .cfi_offset w27, -72
-; VBITS_EQ_128-NEXT: .cfi_offset w28, -80
-; VBITS_EQ_128-NEXT: .cfi_offset w30, -88
-; VBITS_EQ_128-NEXT: .cfi_offset w29, -96
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -104
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -112
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -120
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -128
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -136
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -144
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -152
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -160
-; VBITS_EQ_128-NEXT: ldp q3, q2, [x0]
-; VBITS_EQ_128-NEXT: mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT: ldp q5, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: fmov x2, d2
-; VBITS_EQ_128-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: fmov x8, d3
-; VBITS_EQ_128-NEXT: mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT: fmov x7, d5
-; VBITS_EQ_128-NEXT: str x8, [sp] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q6, q3, [x0, #96]
-; VBITS_EQ_128-NEXT: mov x20, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x21, d4
-; VBITS_EQ_128-NEXT: mov x23, v6.d[1]
-; VBITS_EQ_128-NEXT: fmov x24, d6
-; VBITS_EQ_128-NEXT: ldp q16, q4, [x0, #128]
-; VBITS_EQ_128-NEXT: mov x26, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x27, d3
-; VBITS_EQ_128-NEXT: mov x28, v16.d[1]
-; VBITS_EQ_128-NEXT: fmov x25, d16
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #224]
-; VBITS_EQ_128-NEXT: mov x22, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x19, d4
-; VBITS_EQ_128-NEXT: mov x13, v7.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d7
-; VBITS_EQ_128-NEXT: ldp q17, q6, [x0, #192]
-; VBITS_EQ_128-NEXT: mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT: fmov x10, d5
-; VBITS_EQ_128-NEXT: mov x17, v17.d[1]
-; VBITS_EQ_128-NEXT: fmov x16, d17
-; VBITS_EQ_128-NEXT: ldp q18, q3, [x0, #160]
-; VBITS_EQ_128-NEXT: mov x15, v6.d[1]
-; VBITS_EQ_128-NEXT: fmov x14, d6
-; VBITS_EQ_128-NEXT: mov x5, v18.d[1]
-; VBITS_EQ_128-NEXT: fmov x4, d18
-; VBITS_EQ_128-NEXT: ldp q19, q16, [x1, #224]
-; VBITS_EQ_128-NEXT: mov x29, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x18, d3
-; VBITS_EQ_128-NEXT: fmov x8, d19
-; VBITS_EQ_128-NEXT: mov x9, v19.d[1]
-; VBITS_EQ_128-NEXT: ldp q21, q20, [x1, #192]
-; VBITS_EQ_128-NEXT: mov x30, v16.d[1]
-; VBITS_EQ_128-NEXT: smulh x8, x11, x8
-; VBITS_EQ_128-NEXT: smulh x11, x13, x9
-; VBITS_EQ_128-NEXT: fmov x9, d21
-; VBITS_EQ_128-NEXT: str x8, [sp, #48] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q22, q18, [x1, #160]
-; VBITS_EQ_128-NEXT: ldp q24, q23, [x1, #128]
-; VBITS_EQ_128-NEXT: ldp q25, q17, [x1, #96]
-; VBITS_EQ_128-NEXT: ldp q26, q6, [x1, #64]
-; VBITS_EQ_128-NEXT: ldp q4, q3, [x1, #32]
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x1]
-; VBITS_EQ_128-NEXT: fmov x1, d16
-; VBITS_EQ_128-NEXT: smulh x10, x10, x1
-; VBITS_EQ_128-NEXT: mov x1, v20.d[1]
-; VBITS_EQ_128-NEXT: ldp q1, q0, [x0, #32]
-; VBITS_EQ_128-NEXT: str x10, [sp, #56] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: smulh x10, x12, x30
-; VBITS_EQ_128-NEXT: mov x30, v21.d[1]
-; VBITS_EQ_128-NEXT: fmov x3, d1
-; VBITS_EQ_128-NEXT: str x10, [sp, #24] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: fmov x10, d20
-; VBITS_EQ_128-NEXT: ldr x13, [sp, #16] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d13, d11, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: smulh x8, x14, x10
-; VBITS_EQ_128-NEXT: smulh x10, x15, x1
-; VBITS_EQ_128-NEXT: fmov x15, d18
-; VBITS_EQ_128-NEXT: smulh x14, x16, x9
-; VBITS_EQ_128-NEXT: mov x9, v22.d[1]
-; VBITS_EQ_128-NEXT: smulh x16, x17, x30
-; VBITS_EQ_128-NEXT: stp x11, x8, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: fmov x17, d22
-; VBITS_EQ_128-NEXT: mov x8, v18.d[1]
-; VBITS_EQ_128-NEXT: smulh x18, x18, x15
-; VBITS_EQ_128-NEXT: mov x15, v23.d[1]
-; VBITS_EQ_128-NEXT: str x10, [sp, #8] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: smulh x4, x4, x17
-; VBITS_EQ_128-NEXT: fmov d8, x16
-; VBITS_EQ_128-NEXT: mov x17, v24.d[1]
-; VBITS_EQ_128-NEXT: smulh x5, x5, x9
-; VBITS_EQ_128-NEXT: smulh x1, x29, x8
-; VBITS_EQ_128-NEXT: fmov x8, d23
-; VBITS_EQ_128-NEXT: fmov x9, d24
-; VBITS_EQ_128-NEXT: smulh x22, x22, x15
-; VBITS_EQ_128-NEXT: fmov x15, d17
-; VBITS_EQ_128-NEXT: fmov d9, x14
-; VBITS_EQ_128-NEXT: smulh x19, x19, x8
-; VBITS_EQ_128-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: mov x8, v17.d[1]
-; VBITS_EQ_128-NEXT: smulh x25, x25, x9
-; VBITS_EQ_128-NEXT: mov x9, v25.d[1]
-; VBITS_EQ_128-NEXT: smulh x28, x28, x17
-; VBITS_EQ_128-NEXT: fmov x17, d25
-; VBITS_EQ_128-NEXT: smulh x15, x27, x15
-; VBITS_EQ_128-NEXT: mov x27, v6.d[1]
-; VBITS_EQ_128-NEXT: ldr d15, [sp, #40] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: smulh x12, x26, x8
-; VBITS_EQ_128-NEXT: fmov x26, d6
-; VBITS_EQ_128-NEXT: smulh x17, x24, x17
-; VBITS_EQ_128-NEXT: ldr x8, [sp] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: mov x24, v26.d[1]
-; VBITS_EQ_128-NEXT: smulh x11, x23, x9
-; VBITS_EQ_128-NEXT: fmov x23, d26
-; VBITS_EQ_128-NEXT: smulh x21, x21, x26
-; VBITS_EQ_128-NEXT: fmov x26, d0
-; VBITS_EQ_128-NEXT: smulh x20, x20, x27
-; VBITS_EQ_128-NEXT: fmov x27, d3
-; VBITS_EQ_128-NEXT: fmov d20, x17
-; VBITS_EQ_128-NEXT: smulh x7, x7, x23
-; VBITS_EQ_128-NEXT: fmov x23, d4
-; VBITS_EQ_128-NEXT: smulh x6, x6, x24
-; VBITS_EQ_128-NEXT: fmov x24, d5
-; VBITS_EQ_128-NEXT: smulh x26, x26, x27
-; VBITS_EQ_128-NEXT: fmov x27, d7
-; VBITS_EQ_128-NEXT: smulh x3, x3, x23
-; VBITS_EQ_128-NEXT: fmov d19, x20
-; VBITS_EQ_128-NEXT: mov x23, v2.d[1]
-; VBITS_EQ_128-NEXT: smulh x2, x2, x24
-; VBITS_EQ_128-NEXT: mov x24, v1.d[1]
-; VBITS_EQ_128-NEXT: smulh x27, x8, x27
-; VBITS_EQ_128-NEXT: mov x29, v0.d[1]
-; VBITS_EQ_128-NEXT: mov x30, v7.d[1]
-; VBITS_EQ_128-NEXT: mov x8, v5.d[1]
-; VBITS_EQ_128-NEXT: mov x9, v4.d[1]
-; VBITS_EQ_128-NEXT: mov x10, v3.d[1]
-; VBITS_EQ_128-NEXT: ldp d10, d12, [sp, #24] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: smulh x30, x13, x30
-; VBITS_EQ_128-NEXT: fmov d0, x27
-; VBITS_EQ_128-NEXT: smulh x8, x23, x8
-; VBITS_EQ_128-NEXT: fmov d2, x2
-; VBITS_EQ_128-NEXT: smulh x9, x24, x9
-; VBITS_EQ_128-NEXT: fmov d4, x3
-; VBITS_EQ_128-NEXT: smulh x10, x29, x10
-; VBITS_EQ_128-NEXT: fmov d6, x26
-; VBITS_EQ_128-NEXT: mov v11.d[1], v10.d[0]
-; VBITS_EQ_128-NEXT: fmov d1, x30
-; VBITS_EQ_128-NEXT: mov v13.d[1], v12.d[0]
-; VBITS_EQ_128-NEXT: mov v15.d[1], v14.d[0]
-; VBITS_EQ_128-NEXT: mov v9.d[1], v8.d[0]
-; VBITS_EQ_128-NEXT: fmov d3, x8
-; VBITS_EQ_128-NEXT: fmov d5, x9
-; VBITS_EQ_128-NEXT: fmov d7, x10
-; VBITS_EQ_128-NEXT: fmov d17, x6
-; VBITS_EQ_128-NEXT: fmov d16, x7
-; VBITS_EQ_128-NEXT: fmov d18, x21
-; VBITS_EQ_128-NEXT: fmov d21, x11
-; VBITS_EQ_128-NEXT: fmov d22, x12
-; VBITS_EQ_128-NEXT: fmov d23, x15
-; VBITS_EQ_128-NEXT: fmov d24, x28
-; VBITS_EQ_128-NEXT: fmov d25, x25
-; VBITS_EQ_128-NEXT: fmov d26, x22
-; VBITS_EQ_128-NEXT: fmov d27, x19
-; VBITS_EQ_128-NEXT: fmov d28, x5
-; VBITS_EQ_128-NEXT: fmov d29, x4
-; VBITS_EQ_128-NEXT: fmov d30, x1
-; VBITS_EQ_128-NEXT: fmov d31, x18
-; VBITS_EQ_128-NEXT: mov v27.d[1], v26.d[0]
-; VBITS_EQ_128-NEXT: stp q9, q15, [x0, #192]
-; VBITS_EQ_128-NEXT: stp q13, q11, [x0, #224]
-; VBITS_EQ_128-NEXT: mov v31.d[1], v30.d[0]
-; VBITS_EQ_128-NEXT: mov v29.d[1], v28.d[0]
-; VBITS_EQ_128-NEXT: mov v25.d[1], v24.d[0]
-; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT: mov v20.d[1], v21.d[0]
-; VBITS_EQ_128-NEXT: mov v18.d[1], v19.d[0]
-; VBITS_EQ_128-NEXT: stp q29, q31, [x0, #160]
-; VBITS_EQ_128-NEXT: mov v16.d[1], v17.d[0]
-; VBITS_EQ_128-NEXT: stp q25, q27, [x0, #128]
-; VBITS_EQ_128-NEXT: mov v6.d[1], v7.d[0]
-; VBITS_EQ_128-NEXT: mov v4.d[1], v5.d[0]
-; VBITS_EQ_128-NEXT: stp q20, q23, [x0, #96]
-; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT: stp q16, q18, [x0, #64]
-; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q4, q6, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add sp, sp, #224
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: smulh_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%1 = sext <32 x i64> %op1 to <32 x i128>
@@ -1773,35 +666,29 @@ define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
-define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v8i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v1.8b
-; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v8i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: umull v0.8h, v0.8b, v1.8b
-; VBITS_GE_256-NEXT: ushr v1.8h, v0.8h, #8
-; VBITS_GE_256-NEXT: umov w8, v1.h[0]
-; VBITS_GE_256-NEXT: umov w9, v1.h[1]
-; VBITS_GE_256-NEXT: fmov s0, w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[2]
-; VBITS_GE_256-NEXT: mov v0.b[1], w9
-; VBITS_GE_256-NEXT: mov v0.b[2], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[3]
-; VBITS_GE_256-NEXT: mov v0.b[3], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[4]
-; VBITS_GE_256-NEXT: mov v0.b[4], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[5]
-; VBITS_GE_256-NEXT: mov v0.b[5], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[6]
-; VBITS_GE_256-NEXT: mov v0.b[6], w8
-; VBITS_GE_256-NEXT: umov w8, v1.h[7]
-; VBITS_GE_256-NEXT: mov v0.b[7], w8
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT: ret
+define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ushr v1.8h, v0.8h, #8
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: mov v0.b[1], w9
+; CHECK-NEXT: mov v0.b[2], w8
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: mov v0.b[3], w8
+; CHECK-NEXT: umov w8, v1.h[4]
+; CHECK-NEXT: mov v0.b[4], w8
+; CHECK-NEXT: umov w8, v1.h[5]
+; CHECK-NEXT: mov v0.b[5], w8
+; CHECK-NEXT: umov w8, v1.h[6]
+; CHECK-NEXT: mov v0.b[6], w8
+; CHECK-NEXT: umov w8, v1.h[7]
+; CHECK-NEXT: mov v0.b[7], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%1 = zext <8 x i8> %op1 to <8 x i16>
%2 = zext <8 x i8> %op2 to <8 x i16>
%mul = mul <8 x i16> %1, %2
@@ -1811,7 +698,7 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
@@ -1826,30 +713,15 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
ret <16 x i8> %res
}
-define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: umull v4.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT: umull2 v0.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT: umull v5.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT: umull2 v1.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: shrn v3.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v0.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v1.8h, #8
-; VBITS_EQ_128-NEXT: stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v32i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%1 = zext <32 x i8> %op1 to <32 x i16>
@@ -1862,40 +734,56 @@ define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v64i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT: umull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT: umull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT: umull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT: umull2 v16.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT: umull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: umull2 v3.8h, v4.16b, v5.16b
-; VBITS_EQ_128-NEXT: umull v4.8h, v4.8b, v5.8b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v4.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: umulh_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ptrue p1.h, vl16
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: uunpklo z4.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z5.h, z1.b
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: uunpklo z6.h, z2.b
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: uunpklo z7.h, z3.b
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z2.h, z2.b
+; VBITS_GE_256-NEXT: uunpklo z3.h, z3.b
+; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: movprfx z2, z5
+; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h
+; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h
+; VBITS_GE_256-NEXT: lsr z0.h, p1/m, z0.h, #8
+; VBITS_GE_256-NEXT: movprfx z3, z4
+; VBITS_GE_256-NEXT: lsr z3.h, p1/m, z3.h, #8
+; VBITS_GE_256-NEXT: lsr z1.h, p1/m, z1.h, #8
+; VBITS_GE_256-NEXT: lsr z2.h, p1/m, z2.h, #8
+; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p1.b, vl16
+; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b
+; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b
+; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: umulh_v64i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: umulh_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%1 = zext <64 x i8> %op1 to <64 x i16>
@@ -1907,64 +795,15 @@ define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v128i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT: umull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: umull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT: umull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT: umull2 v17.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT: umull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: umull2 v19.8h, v4.16b, v16.16b
-; VBITS_EQ_128-NEXT: umull v4.8h, v4.8b, v16.8b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v4.16b, v19.8h, #8
-; VBITS_EQ_128-NEXT: umull2 v21.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT: umull v3.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT: ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT: umull2 v23.8h, v18.16b, v20.16b
-; VBITS_EQ_128-NEXT: umull v18.8h, v18.8b, v20.8b
-; VBITS_EQ_128-NEXT: shrn v3.8b, v3.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT: shrn v18.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn2 v18.16b, v23.8h, #8
-; VBITS_EQ_128-NEXT: umull v20.8h, v16.8b, v5.8b
-; VBITS_EQ_128-NEXT: umull2 v5.8h, v16.16b, v5.16b
-; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: umull v25.8h, v22.8b, v24.8b
-; VBITS_EQ_128-NEXT: umull2 v16.8h, v22.16b, v24.16b
-; VBITS_EQ_128-NEXT: shrn v20.8b, v20.8h, #8
-; VBITS_EQ_128-NEXT: shrn v22.8b, v25.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v20.16b, v5.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v22.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT: stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-
+define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%insert = insertelement <128 x i16> undef, i16 8, i64 0
@@ -1978,130 +817,15 @@ define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v256i8:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: sub sp, sp, #96
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: umull2 v0.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT: umull v4.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT: umull2 v0.8h, v2.16b, v6.16b
-; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT: umull v6.8h, v2.8b, v6.8b
-; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: umull2 v2.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT: shrn v6.8b, v6.8h, #8
-; VBITS_EQ_128-NEXT: umull v5.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT: umull2 v3.8h, v7.16b, v16.16b
-; VBITS_EQ_128-NEXT: umull v7.8h, v7.8b, v16.8b
-; VBITS_EQ_128-NEXT: shrn v5.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v5.16b, v2.8h, #8
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT: shrn v7.8b, v7.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v7.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT: umull2 v31.8h, v19.16b, v16.16b
-; VBITS_EQ_128-NEXT: umull v9.8h, v19.8b, v16.8b
-; VBITS_EQ_128-NEXT: umull2 v21.8h, v18.16b, v17.16b
-; VBITS_EQ_128-NEXT: umull v30.8h, v18.8b, v17.8b
-; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v9.8b, v9.8h, #8
-; VBITS_EQ_128-NEXT: shrn v30.8b, v30.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v9.16b, v31.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v30.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT: umull2 v16.8h, v17.16b, v20.16b
-; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: umull v18.8h, v17.8b, v20.8b
-; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT: umull2 v17.8h, v22.16b, v19.16b
-; VBITS_EQ_128-NEXT: shrn2 v4.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: umull v19.8h, v22.8b, v19.8b
-; VBITS_EQ_128-NEXT: shrn v2.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT: shrn v3.8b, v19.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT: umull2 v12.8h, v24.16b, v22.16b
-; VBITS_EQ_128-NEXT: umull v13.8h, v24.8b, v22.8b
-; VBITS_EQ_128-NEXT: umull2 v10.8h, v20.16b, v23.16b
-; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: umull v11.8h, v20.8b, v23.8b
-; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT: shrn2 v6.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT: umull2 v22.8h, v26.16b, v24.16b
-; VBITS_EQ_128-NEXT: umull v24.8h, v26.8b, v24.8b
-; VBITS_EQ_128-NEXT: umull2 v20.8h, v23.16b, v25.16b
-; VBITS_EQ_128-NEXT: umull v23.8h, v23.8b, v25.8b
-; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT: umull2 v15.8h, v28.16b, v26.16b
-; VBITS_EQ_128-NEXT: umull v1.8h, v28.8b, v26.8b
-; VBITS_EQ_128-NEXT: umull2 v14.8h, v25.16b, v27.16b
-; VBITS_EQ_128-NEXT: umull v8.8h, v25.8b, v27.8b
-; VBITS_EQ_128-NEXT: ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT: shrn v8.8b, v8.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v8.16b, v14.8h, #8
-; VBITS_EQ_128-NEXT: ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v2.8b, v23.8h, #8
-; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT: shrn v3.8b, v24.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v2.16b, v20.8h, #8
-; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: umull2 v26.8h, v0.16b, v28.16b
-; VBITS_EQ_128-NEXT: shrn2 v3.16b, v22.8h, #8
-; VBITS_EQ_128-NEXT: umull v28.8h, v0.8b, v28.8b
-; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT: umull2 v25.8h, v27.16b, v29.16b
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT: umull v27.8h, v27.8b, v29.8b
-; VBITS_EQ_128-NEXT: shrn v29.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT: shrn v0.8b, v13.8h, #8
-; VBITS_EQ_128-NEXT: shrn v1.8b, v11.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v29.16b, v15.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v12.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v10.8h, #8
-; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn v0.8b, v27.8h, #8
-; VBITS_EQ_128-NEXT: shrn v1.8b, v28.8h, #8
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: shrn2 v0.16b, v25.8h, #8
-; VBITS_EQ_128-NEXT: shrn2 v1.16b, v26.8h, #8
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add sp, sp, #96
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%1 = zext <256 x i8> %op1 to <256 x i16>
@@ -2115,26 +839,20 @@ define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
-define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v4i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v1.4h
-; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v4i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: umull v0.4s, v0.4h, v1.4h
-; VBITS_GE_256-NEXT: ushr v1.4s, v0.4s, #16
-; VBITS_GE_256-NEXT: mov w8, v1.s[1]
-; VBITS_GE_256-NEXT: mov w9, v1.s[2]
-; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
-; VBITS_GE_256-NEXT: mov v0.h[1], w8
-; VBITS_GE_256-NEXT: mov w8, v1.s[3]
-; VBITS_GE_256-NEXT: mov v0.h[2], w9
-; VBITS_GE_256-NEXT: mov v0.h[3], w8
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT: ret
+define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: ushr v1.4s, v0.4s, #16
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%1 = zext <4 x i16> %op1 to <4 x i32>
%2 = zext <4 x i16> %op2 to <4 x i32>
%mul = mul <4 x i32> %1, %2
@@ -2144,7 +862,7 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
@@ -2159,30 +877,15 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
ret <8 x i16> %res
}
-define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v16i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: umull v4.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT: umull2 v0.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT: umull v5.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT: umull2 v1.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: shrn v3.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v0.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v1.4s, #16
-; VBITS_EQ_128-NEXT: stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v16i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%1 = zext <16 x i16> %op1 to <16 x i32>
@@ -2195,40 +898,47 @@ define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT: umull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT: umull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT: umull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT: umull2 v16.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT: umull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: umull2 v3.4s, v4.8h, v5.8h
-; VBITS_EQ_128-NEXT: umull v4.4s, v4.4h, v5.4h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v4.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: umulh_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z7.d, z1.d
+; VBITS_GE_256-NEXT: mov z16.d, z3.d
+; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT: umull2 v4.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT: umull v5.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: umull2 v6.4s, v1.8h, v3.8h
+; VBITS_GE_256-NEXT: umull v1.4s, v1.4h, v3.4h
+; VBITS_GE_256-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT: umull v0.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT: umull2 v2.4s, v7.8h, v16.8h
+; VBITS_GE_256-NEXT: umull v7.4s, v7.4h, v16.4h
+; VBITS_GE_256-NEXT: uzp2 v4.8h, v5.8h, v4.8h
+; VBITS_GE_256-NEXT: uzp2 v1.8h, v1.8h, v6.8h
+; VBITS_GE_256-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; VBITS_GE_256-NEXT: uzp2 v2.8h, v7.8h, v2.8h
+; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h
+; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: umulh_v32i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: umulh_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%1 = zext <32 x i16> %op1 to <32 x i32>
@@ -2240,63 +950,15 @@ define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v64i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT: umull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: umull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT: umull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT: umull2 v17.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT: umull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: umull2 v19.4s, v4.8h, v16.8h
-; VBITS_EQ_128-NEXT: umull v4.4s, v4.4h, v16.4h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v4.8h, v19.4s, #16
-; VBITS_EQ_128-NEXT: umull2 v21.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT: umull v3.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT: ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT: umull2 v23.4s, v18.8h, v20.8h
-; VBITS_EQ_128-NEXT: umull v18.4s, v18.4h, v20.4h
-; VBITS_EQ_128-NEXT: shrn v3.4h, v3.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT: shrn v18.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn2 v18.8h, v23.4s, #16
-; VBITS_EQ_128-NEXT: umull v20.4s, v16.4h, v5.4h
-; VBITS_EQ_128-NEXT: umull2 v5.4s, v16.8h, v5.8h
-; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT: umull v25.4s, v22.4h, v24.4h
-; VBITS_EQ_128-NEXT: umull2 v16.4s, v22.8h, v24.8h
-; VBITS_EQ_128-NEXT: shrn v20.4h, v20.4s, #16
-; VBITS_EQ_128-NEXT: shrn v22.4h, v25.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v20.8h, v5.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v22.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT: stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%1 = zext <64 x i16> %op1 to <64 x i32>
@@ -2308,130 +970,15 @@ define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v128i16:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: sub sp, sp, #96
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: umull2 v0.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT: umull v4.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT: umull2 v0.4s, v2.8h, v6.8h
-; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT: umull v6.4s, v2.4h, v6.4h
-; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: umull2 v2.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT: shrn v6.4h, v6.4s, #16
-; VBITS_EQ_128-NEXT: umull v5.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT: umull2 v3.4s, v7.8h, v16.8h
-; VBITS_EQ_128-NEXT: umull v7.4s, v7.4h, v16.4h
-; VBITS_EQ_128-NEXT: shrn v5.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v5.8h, v2.4s, #16
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT: shrn v7.4h, v7.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v7.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT: umull2 v31.4s, v19.8h, v16.8h
-; VBITS_EQ_128-NEXT: umull v9.4s, v19.4h, v16.4h
-; VBITS_EQ_128-NEXT: umull2 v21.4s, v18.8h, v17.8h
-; VBITS_EQ_128-NEXT: umull v30.4s, v18.4h, v17.4h
-; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v9.4h, v9.4s, #16
-; VBITS_EQ_128-NEXT: shrn v30.4h, v30.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v9.8h, v31.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v30.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT: umull2 v16.4s, v17.8h, v20.8h
-; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: umull v18.4s, v17.4h, v20.4h
-; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT: umull2 v17.4s, v22.8h, v19.8h
-; VBITS_EQ_128-NEXT: shrn2 v4.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: umull v19.4s, v22.4h, v19.4h
-; VBITS_EQ_128-NEXT: shrn v2.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT: shrn v3.4h, v19.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT: umull2 v12.4s, v24.8h, v22.8h
-; VBITS_EQ_128-NEXT: umull v13.4s, v24.4h, v22.4h
-; VBITS_EQ_128-NEXT: umull2 v10.4s, v20.8h, v23.8h
-; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: umull v11.4s, v20.4h, v23.4h
-; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT: shrn2 v6.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT: umull2 v22.4s, v26.8h, v24.8h
-; VBITS_EQ_128-NEXT: umull v24.4s, v26.4h, v24.4h
-; VBITS_EQ_128-NEXT: umull2 v20.4s, v23.8h, v25.8h
-; VBITS_EQ_128-NEXT: umull v23.4s, v23.4h, v25.4h
-; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT: umull2 v15.4s, v28.8h, v26.8h
-; VBITS_EQ_128-NEXT: umull v1.4s, v28.4h, v26.4h
-; VBITS_EQ_128-NEXT: umull2 v14.4s, v25.8h, v27.8h
-; VBITS_EQ_128-NEXT: umull v8.4s, v25.4h, v27.4h
-; VBITS_EQ_128-NEXT: ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT: shrn v8.4h, v8.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v8.8h, v14.4s, #16
-; VBITS_EQ_128-NEXT: ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT: shrn v2.4h, v23.4s, #16
-; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT: shrn v3.4h, v24.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v2.8h, v20.4s, #16
-; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT: umull2 v26.4s, v0.8h, v28.8h
-; VBITS_EQ_128-NEXT: shrn2 v3.8h, v22.4s, #16
-; VBITS_EQ_128-NEXT: umull v28.4s, v0.4h, v28.4h
-; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT: umull2 v25.4s, v27.8h, v29.8h
-; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT: umull v27.4s, v27.4h, v29.4h
-; VBITS_EQ_128-NEXT: shrn v29.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT: shrn v0.4h, v13.4s, #16
-; VBITS_EQ_128-NEXT: shrn v1.4h, v11.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v29.8h, v15.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v12.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v10.4s, #16
-; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn v0.4h, v27.4s, #16
-; VBITS_EQ_128-NEXT: shrn v1.4h, v28.4s, #16
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: shrn2 v0.8h, v25.4s, #16
-; VBITS_EQ_128-NEXT: shrn2 v1.8h, v26.4s, #16
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add sp, sp, #96
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%1 = zext <128 x i16> %op1 to <128 x i32>
@@ -2444,7 +991,7 @@ define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
@@ -2453,8 +1000,6 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
-
-
%1 = zext <2 x i32> %op1 to <2 x i64>
%2 = zext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
@@ -2464,7 +1009,7 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
@@ -2479,39 +1024,15 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
ret <4 x i32> %res
}
-define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v8i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: ushll v5.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: ushll v4.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: ushll v7.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: ushll v6.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z7.d
-; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z6.d
-; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT: shrn v2.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_128-NEXT: shrn2 v5.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v2.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT: stp q5, q2, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v8i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -2526,57 +1047,47 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v16i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q1, q2, [x0, #32]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: ushll v19.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT: ushll v18.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: ushll v7.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #32]
-; VBITS_EQ_128-NEXT: ushll v0.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v4.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v21.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: ushll v5.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT: ushll2 v22.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z19.d
-; VBITS_EQ_128-NEXT: ushll v6.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z21.d
-; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z22.d
-; VBITS_EQ_128-NEXT: ushll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z18.d
-; VBITS_EQ_128-NEXT: ushll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: ushll v20.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z19.d
-; VBITS_EQ_128-NEXT: ushll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z20.d
-; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: shrn v7.2s, v7.2d, #32
-; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
-; VBITS_EQ_128-NEXT: shrn2 v5.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v2.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v7.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT: stp q5, q6, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q7, q0, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: umulh_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z7.d, z1.d
+; VBITS_GE_256-NEXT: mov z16.d, z3.d
+; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT: umull2 v4.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT: umull v5.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: umull2 v6.2d, v1.4s, v3.4s
+; VBITS_GE_256-NEXT: umull v1.2d, v1.2s, v3.2s
+; VBITS_GE_256-NEXT: umull2 v3.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT: umull v0.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT: umull2 v2.2d, v7.4s, v16.4s
+; VBITS_GE_256-NEXT: umull v7.2d, v7.2s, v16.2s
+; VBITS_GE_256-NEXT: uzp2 v4.4s, v5.4s, v4.4s
+; VBITS_GE_256-NEXT: uzp2 v1.4s, v1.4s, v6.4s
+; VBITS_GE_256-NEXT: uzp2 v0.4s, v0.4s, v3.4s
+; VBITS_GE_256-NEXT: uzp2 v2.4s, v7.4s, v2.4s
+; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: umulh_v16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: umulh_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%1 = zext <16 x i32> %op1 to <16 x i64>
@@ -2588,104 +1099,15 @@ define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -32
-; VBITS_EQ_128-NEXT: ldp q17, q16, [x0, #64]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: ushll v27.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v29.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: ldp q23, q28, [x0, #96]
-; VBITS_EQ_128-NEXT: ushll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v22.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: ushll v31.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v8.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT: ldp q26, q25, [x1, #96]
-; VBITS_EQ_128-NEXT: ushll v30.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v28.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v9.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT: ushll v26.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT: ldp q24, q21, [x1, #64]
-; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z31.d
-; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z9.d
-; VBITS_EQ_128-NEXT: ushll2 v10.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT: ushll v25.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v31.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z10.d
-; VBITS_EQ_128-NEXT: ushll v24.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z30.d
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z27.d
-; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z31.d
-; VBITS_EQ_128-NEXT: ushll2 v30.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT: ushll v21.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT: ushll v6.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z30.d
-; VBITS_EQ_128-NEXT: mul z19.d, p0/m, z19.d, z21.d
-; VBITS_EQ_128-NEXT: ldp q20, q18, [x1, #32]
-; VBITS_EQ_128-NEXT: ushll v4.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT: shrn v19.2s, v19.2d, #32
-; VBITS_EQ_128-NEXT: ushll2 v5.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v7.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v27.2d, v20.4s, #0
-; VBITS_EQ_128-NEXT: ushll v20.2d, v20.2s, #0
-; VBITS_EQ_128-NEXT: ldp q3, q1, [x0]
-; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z20.d
-; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z27.d
-; VBITS_EQ_128-NEXT: ushll2 v21.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: ushll v18.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT: ushll v2.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z21.d
-; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z18.d
-; VBITS_EQ_128-NEXT: ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT: ushll v0.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: shrn v4.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: shrn v18.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT: ushll v20.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT: ushll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: ushll v23.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z20.d
-; VBITS_EQ_128-NEXT: ushll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT: shrn v16.2s, v26.2d, #32
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z23.d
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT: shrn v2.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT: shrn v17.2s, v25.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v16.4s, v8.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v18.4s, v29.2d, #32
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: shrn2 v17.4s, v28.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v4.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v2.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT: stp q18, q19, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q2, q0, [x0]
-; VBITS_EQ_128-NEXT: stp q16, q17, [x0, #96]
-; VBITS_EQ_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%1 = zext <32 x i32> %op1 to <32 x i64>
@@ -2697,276 +1119,15 @@ define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v64i32:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 80
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset w29, -16
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -24
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -32
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -40
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -48
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -56
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -64
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -72
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -80
-; VBITS_EQ_128-NEXT: addvl sp, sp, #-12
-; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG
-; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG
-; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #96]
-; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: stp q5, q4, [sp, #-80]! // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q0, q2, [x0, #48]
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ldr q1, [x0, #32]
-; VBITS_EQ_128-NEXT: ldr q3, [x0, #80]
-; VBITS_EQ_128-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: ushll v1.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: stp q3, q2, [sp, #32] // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: str z1, [x8, #11, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: str z0, [x8, #10, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #9, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: ldp q23, q26, [x0, #128]
-; VBITS_EQ_128-NEXT: str z0, [x8, #8, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #7, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: ldp q25, q24, [x0, #160]
-; VBITS_EQ_128-NEXT: str z0, [x8, #6, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v1.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #5, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v27.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT: ldp q30, q0, [x0, #192]
-; VBITS_EQ_128-NEXT: str z1, [x8, #4, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v9.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v12.2d, v30.4s, #0
-; VBITS_EQ_128-NEXT: ldp q31, q1, [x0, #224]
-; VBITS_EQ_128-NEXT: ushll v11.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v8.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: ushll v10.2d, v31.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v15.2d, v31.4s, #0
-; VBITS_EQ_128-NEXT: ldp q29, q28, [x1, #224]
-; VBITS_EQ_128-NEXT: ushll2 v18.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT: ushll v31.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v2.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT: ldp q14, q0, [x1, #192]
-; VBITS_EQ_128-NEXT: ushll v1.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT: ushll v20.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v19.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT: mul z11.d, p0/m, z11.d, z20.d
-; VBITS_EQ_128-NEXT: ldp q21, q22, [x0]
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z18.d
-; VBITS_EQ_128-NEXT: ushll v18.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT: ushll v20.2d, v14.2s, #0
-; VBITS_EQ_128-NEXT: ldp q4, q13, [x1, #160]
-; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #128]
-; VBITS_EQ_128-NEXT: ldp q7, q3, [x1, #96]
-; VBITS_EQ_128-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ldp q17, q16, [x1, #64]
-; VBITS_EQ_128-NEXT: movprfx z0, z31
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z0, z15
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT: ushll v1.2d, v30.2s, #0
-; VBITS_EQ_128-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ldp q2, q29, [x1, #32]
-; VBITS_EQ_128-NEXT: movprfx z15, z10
-; VBITS_EQ_128-NEXT: mul z15.d, p0/m, z15.d, z18.d
-; VBITS_EQ_128-NEXT: movprfx z0, z8
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z19.d
-; VBITS_EQ_128-NEXT: str z0, [x8] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v14.4s, #0
-; VBITS_EQ_128-NEXT: ldp q19, q18, [x1]
-; VBITS_EQ_128-NEXT: movprfx z10, z12
-; VBITS_EQ_128-NEXT: mul z10.d, p0/m, z10.d, z0.d
-; VBITS_EQ_128-NEXT: movprfx z8, z1
-; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z20.d
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v13.4s, #0
-; VBITS_EQ_128-NEXT: ushll v12.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT: ushll v1.2d, v13.2s, #0
-; VBITS_EQ_128-NEXT: mul z9.d, p0/m, z9.d, z0.d
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT: mul z12.d, p0/m, z12.d, z1.d
-; VBITS_EQ_128-NEXT: ushll v1.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT: mul z27.d, p0/m, z27.d, z0.d
-; VBITS_EQ_128-NEXT: ushll v20.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z13, z20
-; VBITS_EQ_128-NEXT: mul z13.d, p0/m, z13.d, z1.d
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT: ushll v1.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT: ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z14, z6
-; VBITS_EQ_128-NEXT: mul z14.d, p0/m, z14.d, z0.d
-; VBITS_EQ_128-NEXT: ushll v4.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z30, z4
-; VBITS_EQ_128-NEXT: mul z30.d, p0/m, z30.d, z1.d
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT: ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll v1.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z31, z4
-; VBITS_EQ_128-NEXT: mul z31.d, p0/m, z31.d, z0.d
-; VBITS_EQ_128-NEXT: ushll v6.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT: ldr q4, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT: movprfx z28, z6
-; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z1.d
-; VBITS_EQ_128-NEXT: ushll v1.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z23, z3
-; VBITS_EQ_128-NEXT: mul z23.d, p0/m, z23.d, z0.d
-; VBITS_EQ_128-NEXT: ushll v5.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: movprfx z20, z5
-; VBITS_EQ_128-NEXT: mul z20.d, p0/m, z20.d, z1.d
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT: ushll v4.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z7, z1
-; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z0.d
-; VBITS_EQ_128-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ushll v3.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z6, z3
-; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z4.d
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT: ushll v5.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: movprfx z26, z1
-; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z0.d
-; VBITS_EQ_128-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll v3.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z24, z5
-; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z3.d
-; VBITS_EQ_128-NEXT: ushll v16.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT: movprfx z25, z1
-; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z0.d
-; VBITS_EQ_128-NEXT: ushll v5.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT: ushll v17.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT: movprfx z29, z16
-; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z5.d
-; VBITS_EQ_128-NEXT: ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: movprfx z4, z1
-; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z0.d
-; VBITS_EQ_128-NEXT: ushll v5.2d, v22.2s, #0
-; VBITS_EQ_128-NEXT: ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: ushll2 v16.2d, v22.4s, #0
-; VBITS_EQ_128-NEXT: movprfx z22, z0
-; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z17.d
-; VBITS_EQ_128-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ushll v1.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT: ushll v17.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT: ushll v3.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT: ushll2 v18.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT: movprfx z2, z5
-; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z3.d
-; VBITS_EQ_128-NEXT: mul z18.d, p0/m, z18.d, z16.d
-; VBITS_EQ_128-NEXT: ushll2 v5.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT: ushll2 v16.2d, v19.4s, #0
-; VBITS_EQ_128-NEXT: ushll v17.2d, v19.2s, #0
-; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
-; VBITS_EQ_128-NEXT: shrn v16.2s, v1.2d, #32
-; VBITS_EQ_128-NEXT: ushll v3.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT: shrn v21.2s, v22.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v16.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT: shrn v0.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: shrn v1.2s, v20.2d, #32
-; VBITS_EQ_128-NEXT: mul z17.d, p0/m, z17.d, z3.d
-; VBITS_EQ_128-NEXT: shrn2 v21.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT: shrn v3.2s, v13.2d, #32
-; VBITS_EQ_128-NEXT: ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: shrn v4.2s, v12.2d, #32
-; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT: shrn v7.2s, v15.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v1.4s, v23.2d, #32
-; VBITS_EQ_128-NEXT: ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add x8, sp, #80
-; VBITS_EQ_128-NEXT: shrn2 v3.4s, v27.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v4.4s, v9.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v19.2d, #32
-; VBITS_EQ_128-NEXT: shrn v19.2s, v11.2d, #32
-; VBITS_EQ_128-NEXT: ldr z22, [x8] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q16, q21, [x0, #32]
-; VBITS_EQ_128-NEXT: shrn2 v7.4s, v20.2d, #32
-; VBITS_EQ_128-NEXT: shrn v20.2s, v8.2d, #32
-; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: shrn v0.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT: stp q3, q4, [x0, #160]
-; VBITS_EQ_128-NEXT: shrn v3.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #224]
-; VBITS_EQ_128-NEXT: shrn v6.2s, v30.2d, #32
-; VBITS_EQ_128-NEXT: shrn v7.2s, v28.2d, #32
-; VBITS_EQ_128-NEXT: shrn v4.2s, v29.2d, #32
-; VBITS_EQ_128-NEXT: shrn v1.2s, v17.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v20.4s, v10.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v6.4s, v14.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v7.4s, v31.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v3.4s, v26.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v4.4s, v25.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v0.4s, v18.2d, #32
-; VBITS_EQ_128-NEXT: shrn2 v1.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #128]
-; VBITS_EQ_128-NEXT: stp q4, q3, [x0, #64]
-; VBITS_EQ_128-NEXT: stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT: stp q20, q19, [x0, #192]
-; VBITS_EQ_128-NEXT: addvl sp, sp, #12
-; VBITS_EQ_128-NEXT: add sp, sp, #80
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%1 = zext <64 x i32> %op1 to <64 x i64>
@@ -2979,25 +1140,15 @@ define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v1i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: // kill: def $d1 killed $d1 def $q1
-; VBITS_EQ_128-NEXT: // kill: def $d0 killed $d0 def $q0
-; VBITS_EQ_128-NEXT: fmov x8, d0
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: umulh x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d0, x8
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v1i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.d, vl1
-; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
-; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT: ret
+define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%1 = zext <1 x i64> %op1 to <1 x i128>
%2 = zext <1 x i64> %op2 to <1 x i128>
%mul = mul <1 x i128> %1, %2
@@ -3007,28 +1158,15 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
}
; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v2i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: mov x8, v0.d[1]
-; VBITS_EQ_128-NEXT: fmov x10, d0
-; VBITS_EQ_128-NEXT: mov x9, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d1
-; VBITS_EQ_128-NEXT: umulh x10, x10, x11
-; VBITS_EQ_128-NEXT: umulh x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d0, x10
-; VBITS_EQ_128-NEXT: fmov d1, x8
-; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v2i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.d, vl2
-; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
-; VBITS_GE_256-NEXT: ret
+define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%1 = zext <2 x i64> %op1 to <2 x i128>
%2 = zext <2 x i64> %op2 to <2 x i128>
%mul = mul <2 x i128> %1, %2
@@ -3037,40 +1175,15 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
ret <2 x i64> %res
}
-define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v4i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT: mov x10, v0.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d0
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT: mov x8, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mov x12, v2.d[1]
-; VBITS_EQ_128-NEXT: fmov x13, d2
-; VBITS_EQ_128-NEXT: mov x14, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x15, d3
-; VBITS_EQ_128-NEXT: umulh x11, x11, x13
-; VBITS_EQ_128-NEXT: umulh x10, x10, x12
-; VBITS_EQ_128-NEXT: umulh x9, x9, x15
-; VBITS_EQ_128-NEXT: umulh x8, x8, x14
-; VBITS_EQ_128-NEXT: fmov d0, x11
-; VBITS_EQ_128-NEXT: fmov d1, x10
-; VBITS_EQ_128-NEXT: fmov d2, x9
-; VBITS_EQ_128-NEXT: fmov d3, x8
-; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT: stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_256-LABEL: umulh_v4i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
+define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%1 = zext <4 x i64> %op1 to <4 x i128>
@@ -3083,60 +1196,69 @@ define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v8i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT: fmov x14, d0
-; VBITS_EQ_128-NEXT: mov x13, v0.d[1]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov x12, d1
-; VBITS_EQ_128-NEXT: mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT: ldp q4, q5, [x1, #32]
-; VBITS_EQ_128-NEXT: mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x9, d3
-; VBITS_EQ_128-NEXT: fmov x17, d4
-; VBITS_EQ_128-NEXT: mov x15, v4.d[1]
-; VBITS_EQ_128-NEXT: ldp q3, q1, [x1]
-; VBITS_EQ_128-NEXT: fmov x1, d5
-; VBITS_EQ_128-NEXT: umulh x14, x14, x17
-; VBITS_EQ_128-NEXT: mov x18, v5.d[1]
-; VBITS_EQ_128-NEXT: umulh x13, x13, x15
-; VBITS_EQ_128-NEXT: fmov x15, d2
-; VBITS_EQ_128-NEXT: umulh x12, x12, x1
-; VBITS_EQ_128-NEXT: mov x1, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x17, d1
-; VBITS_EQ_128-NEXT: umulh x11, x11, x18
-; VBITS_EQ_128-NEXT: mov x16, v1.d[1]
-; VBITS_EQ_128-NEXT: fmov d2, x13
-; VBITS_EQ_128-NEXT: fmov d5, x12
-; VBITS_EQ_128-NEXT: umulh x9, x9, x17
-; VBITS_EQ_128-NEXT: fmov x17, d3
-; VBITS_EQ_128-NEXT: umulh x10, x10, x1
-; VBITS_EQ_128-NEXT: fmov d3, x14
-; VBITS_EQ_128-NEXT: umulh x8, x8, x16
-; VBITS_EQ_128-NEXT: fmov d4, x11
-; VBITS_EQ_128-NEXT: umulh x15, x15, x17
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: fmov d6, x10
-; VBITS_EQ_128-NEXT: fmov d0, x8
-; VBITS_EQ_128-NEXT: fmov d7, x15
-; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT: stp q3, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q7, q1, [x0]
-; VBITS_EQ_128-NEXT: ret
+; VBITS_GE_256-LABEL: umulh_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ptrue p1.d, vl2
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov x9, v0.d[1]
+; VBITS_GE_256-NEXT: fmov x10, d0
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: fmov x17, d2
+; VBITS_GE_256-NEXT: mov x13, v2.d[1]
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: mov x14, v0.d[1]
+; VBITS_GE_256-NEXT: mov x18, v2.d[1]
+; VBITS_GE_256-NEXT: umulh x10, x10, x17
+; VBITS_GE_256-NEXT: mov x11, v1.d[1]
+; VBITS_GE_256-NEXT: fmov x12, d1
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: mov x2, v3.d[1]
+; VBITS_GE_256-NEXT: fmov x3, d3
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT: umulh x9, x9, x13
+; VBITS_GE_256-NEXT: mov x13, v1.d[1]
+; VBITS_GE_256-NEXT: umulh x14, x14, x18
+; VBITS_GE_256-NEXT: mov x18, v3.d[1]
+; VBITS_GE_256-NEXT: umulh x12, x12, x3
+; VBITS_GE_256-NEXT: fmov x15, d0
+; VBITS_GE_256-NEXT: fmov x16, d1
+; VBITS_GE_256-NEXT: fmov x1, d2
+; VBITS_GE_256-NEXT: fmov x17, d3
+; VBITS_GE_256-NEXT: fmov d0, x9
+; VBITS_GE_256-NEXT: fmov d1, x10
+; VBITS_GE_256-NEXT: umulh x9, x11, x2
+; VBITS_GE_256-NEXT: umulh x15, x15, x1
+; VBITS_GE_256-NEXT: fmov d4, x12
+; VBITS_GE_256-NEXT: umulh x16, x16, x17
+; VBITS_GE_256-NEXT: umulh x10, x13, x18
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: fmov d0, x14
+; VBITS_GE_256-NEXT: fmov d2, x15
+; VBITS_GE_256-NEXT: fmov d3, x9
+; VBITS_GE_256-NEXT: fmov d6, x16
+; VBITS_GE_256-NEXT: fmov d5, x10
+; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: mov v4.d[1], v3.d[0]
+; VBITS_GE_256-NEXT: mov v6.d[1], v5.d[0]
+; VBITS_GE_256-NEXT: splice z1.d, p1, z1.d, z2.d
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: splice z4.d, p1, z4.d, z6.d
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; VBITS_GE_1024-LABEL: umulh_v8i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+; VBITS_GE_512-LABEL: umulh_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%1 = zext <8 x i64> %op1 to <8 x i128>
@@ -3148,111 +1270,15 @@ define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v16i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: str x21, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT: .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT: .cfi_offset w21, -32
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT: mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d2
-; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x9, d3
-; VBITS_EQ_128-NEXT: mov x14, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x15, d4
-; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT: mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT: fmov x13, d5
-; VBITS_EQ_128-NEXT: fmov x5, d0
-; VBITS_EQ_128-NEXT: mov x4, v0.d[1]
-; VBITS_EQ_128-NEXT: ldp q2, q3, [x0, #64]
-; VBITS_EQ_128-NEXT: mov x3, v1.d[1]
-; VBITS_EQ_128-NEXT: mov x18, v2.d[1]
-; VBITS_EQ_128-NEXT: fmov x2, d2
-; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #96]
-; VBITS_EQ_128-NEXT: mov x16, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x17, d3
-; VBITS_EQ_128-NEXT: fmov x19, d5
-; VBITS_EQ_128-NEXT: mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT: ldp q4, q7, [x1, #64]
-; VBITS_EQ_128-NEXT: mov x20, v6.d[1]
-; VBITS_EQ_128-NEXT: fmov x21, d6
-; VBITS_EQ_128-NEXT: umulh x5, x5, x19
-; VBITS_EQ_128-NEXT: umulh x4, x4, x6
-; VBITS_EQ_128-NEXT: mov x19, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x6, d4
-; VBITS_EQ_128-NEXT: umulh x3, x3, x20
-; VBITS_EQ_128-NEXT: ldp q3, q16, [x1, #32]
-; VBITS_EQ_128-NEXT: fmov x20, d7
-; VBITS_EQ_128-NEXT: umulh x2, x2, x6
-; VBITS_EQ_128-NEXT: umulh x18, x18, x19
-; VBITS_EQ_128-NEXT: fmov d18, x4
-; VBITS_EQ_128-NEXT: fmov d19, x5
-; VBITS_EQ_128-NEXT: fmov d20, x3
-; VBITS_EQ_128-NEXT: umulh x17, x17, x20
-; VBITS_EQ_128-NEXT: fmov x19, d3
-; VBITS_EQ_128-NEXT: fmov d23, x2
-; VBITS_EQ_128-NEXT: ldp q2, q17, [x1]
-; VBITS_EQ_128-NEXT: fmov x1, d1
-; VBITS_EQ_128-NEXT: fmov x20, d16
-; VBITS_EQ_128-NEXT: umulh x15, x15, x19
-; VBITS_EQ_128-NEXT: fmov d22, x18
-; VBITS_EQ_128-NEXT: mov v19.d[1], v18.d[0]
-; VBITS_EQ_128-NEXT: umulh x1, x1, x21
-; VBITS_EQ_128-NEXT: mov x21, v7.d[1]
-; VBITS_EQ_128-NEXT: umulh x13, x13, x20
-; VBITS_EQ_128-NEXT: mov x7, v17.d[1]
-; VBITS_EQ_128-NEXT: mov x6, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x20, v16.d[1]
-; VBITS_EQ_128-NEXT: umulh x16, x16, x21
-; VBITS_EQ_128-NEXT: fmov x21, d2
-; VBITS_EQ_128-NEXT: fmov x19, d17
-; VBITS_EQ_128-NEXT: umulh x8, x8, x7
-; VBITS_EQ_128-NEXT: umulh x10, x10, x6
-; VBITS_EQ_128-NEXT: fmov d5, x13
-; VBITS_EQ_128-NEXT: umulh x11, x11, x21
-; VBITS_EQ_128-NEXT: fmov d7, x15
-; VBITS_EQ_128-NEXT: mov x21, v3.d[1]
-; VBITS_EQ_128-NEXT: umulh x9, x9, x19
-; VBITS_EQ_128-NEXT: umulh x12, x12, x20
-; VBITS_EQ_128-NEXT: fmov d0, x8
-; VBITS_EQ_128-NEXT: fmov d2, x10
-; VBITS_EQ_128-NEXT: fmov d16, x16
-; VBITS_EQ_128-NEXT: fmov d3, x11
-; VBITS_EQ_128-NEXT: fmov d17, x17
-; VBITS_EQ_128-NEXT: umulh x14, x14, x21
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: fmov d4, x12
-; VBITS_EQ_128-NEXT: fmov d21, x1
-; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT: mov v17.d[1], v16.d[0]
-; VBITS_EQ_128-NEXT: fmov d6, x14
-; VBITS_EQ_128-NEXT: mov v21.d[1], v20.d[0]
-; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT: stp q23, q17, [x0, #64]
-; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT: stp q19, q21, [x0, #96]
-; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT: stp q3, q1, [x0]
-; VBITS_EQ_128-NEXT: ldr x21, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%1 = zext <16 x i64> %op1 to <16 x i128>
@@ -3264,237 +1290,15 @@ define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i64:
-; VBITS_EQ_128: // %bb.0:
-; VBITS_EQ_128-NEXT: sub sp, sp, #224
-; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 224
-; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT: .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT: .cfi_offset w21, -24
-; VBITS_EQ_128-NEXT: .cfi_offset w22, -32
-; VBITS_EQ_128-NEXT: .cfi_offset w23, -40
-; VBITS_EQ_128-NEXT: .cfi_offset w24, -48
-; VBITS_EQ_128-NEXT: .cfi_offset w25, -56
-; VBITS_EQ_128-NEXT: .cfi_offset w26, -64
-; VBITS_EQ_128-NEXT: .cfi_offset w27, -72
-; VBITS_EQ_128-NEXT: .cfi_offset w28, -80
-; VBITS_EQ_128-NEXT: .cfi_offset w30, -88
-; VBITS_EQ_128-NEXT: .cfi_offset w29, -96
-; VBITS_EQ_128-NEXT: .cfi_offset b8, -104
-; VBITS_EQ_128-NEXT: .cfi_offset b9, -112
-; VBITS_EQ_128-NEXT: .cfi_offset b10, -120
-; VBITS_EQ_128-NEXT: .cfi_offset b11, -128
-; VBITS_EQ_128-NEXT: .cfi_offset b12, -136
-; VBITS_EQ_128-NEXT: .cfi_offset b13, -144
-; VBITS_EQ_128-NEXT: .cfi_offset b14, -152
-; VBITS_EQ_128-NEXT: .cfi_offset b15, -160
-; VBITS_EQ_128-NEXT: ldp q3, q2, [x0]
-; VBITS_EQ_128-NEXT: mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT: ldp q5, q4, [x0, #64]
-; VBITS_EQ_128-NEXT: fmov x2, d2
-; VBITS_EQ_128-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: fmov x8, d3
-; VBITS_EQ_128-NEXT: mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT: fmov x7, d5
-; VBITS_EQ_128-NEXT: str x8, [sp] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q6, q3, [x0, #96]
-; VBITS_EQ_128-NEXT: mov x20, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x21, d4
-; VBITS_EQ_128-NEXT: mov x23, v6.d[1]
-; VBITS_EQ_128-NEXT: fmov x24, d6
-; VBITS_EQ_128-NEXT: ldp q16, q4, [x0, #128]
-; VBITS_EQ_128-NEXT: mov x26, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x27, d3
-; VBITS_EQ_128-NEXT: mov x28, v16.d[1]
-; VBITS_EQ_128-NEXT: fmov x25, d16
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #224]
-; VBITS_EQ_128-NEXT: mov x22, v4.d[1]
-; VBITS_EQ_128-NEXT: fmov x19, d4
-; VBITS_EQ_128-NEXT: mov x13, v7.d[1]
-; VBITS_EQ_128-NEXT: fmov x11, d7
-; VBITS_EQ_128-NEXT: ldp q17, q6, [x0, #192]
-; VBITS_EQ_128-NEXT: mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT: fmov x10, d5
-; VBITS_EQ_128-NEXT: mov x17, v17.d[1]
-; VBITS_EQ_128-NEXT: fmov x16, d17
-; VBITS_EQ_128-NEXT: ldp q18, q3, [x0, #160]
-; VBITS_EQ_128-NEXT: mov x15, v6.d[1]
-; VBITS_EQ_128-NEXT: fmov x14, d6
-; VBITS_EQ_128-NEXT: mov x5, v18.d[1]
-; VBITS_EQ_128-NEXT: fmov x4, d18
-; VBITS_EQ_128-NEXT: ldp q19, q16, [x1, #224]
-; VBITS_EQ_128-NEXT: mov x29, v3.d[1]
-; VBITS_EQ_128-NEXT: fmov x18, d3
-; VBITS_EQ_128-NEXT: fmov x8, d19
-; VBITS_EQ_128-NEXT: mov x9, v19.d[1]
-; VBITS_EQ_128-NEXT: ldp q21, q20, [x1, #192]
-; VBITS_EQ_128-NEXT: mov x30, v16.d[1]
-; VBITS_EQ_128-NEXT: umulh x8, x11, x8
-; VBITS_EQ_128-NEXT: umulh x11, x13, x9
-; VBITS_EQ_128-NEXT: fmov x9, d21
-; VBITS_EQ_128-NEXT: str x8, [sp, #48] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: ldp q22, q18, [x1, #160]
-; VBITS_EQ_128-NEXT: ldp q24, q23, [x1, #128]
-; VBITS_EQ_128-NEXT: ldp q25, q17, [x1, #96]
-; VBITS_EQ_128-NEXT: ldp q26, q6, [x1, #64]
-; VBITS_EQ_128-NEXT: ldp q4, q3, [x1, #32]
-; VBITS_EQ_128-NEXT: ldp q7, q5, [x1]
-; VBITS_EQ_128-NEXT: fmov x1, d16
-; VBITS_EQ_128-NEXT: umulh x10, x10, x1
-; VBITS_EQ_128-NEXT: mov x1, v20.d[1]
-; VBITS_EQ_128-NEXT: ldp q1, q0, [x0, #32]
-; VBITS_EQ_128-NEXT: str x10, [sp, #56] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: umulh x10, x12, x30
-; VBITS_EQ_128-NEXT: mov x30, v21.d[1]
-; VBITS_EQ_128-NEXT: fmov x3, d1
-; VBITS_EQ_128-NEXT: str x10, [sp, #24] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: fmov x10, d20
-; VBITS_EQ_128-NEXT: ldr x13, [sp, #16] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d13, d11, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: umulh x8, x14, x10
-; VBITS_EQ_128-NEXT: umulh x10, x15, x1
-; VBITS_EQ_128-NEXT: fmov x15, d18
-; VBITS_EQ_128-NEXT: umulh x14, x16, x9
-; VBITS_EQ_128-NEXT: mov x9, v22.d[1]
-; VBITS_EQ_128-NEXT: umulh x16, x17, x30
-; VBITS_EQ_128-NEXT: stp x11, x8, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT: fmov x17, d22
-; VBITS_EQ_128-NEXT: mov x8, v18.d[1]
-; VBITS_EQ_128-NEXT: umulh x18, x18, x15
-; VBITS_EQ_128-NEXT: mov x15, v23.d[1]
-; VBITS_EQ_128-NEXT: str x10, [sp, #8] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT: umulh x4, x4, x17
-; VBITS_EQ_128-NEXT: fmov d8, x16
-; VBITS_EQ_128-NEXT: mov x17, v24.d[1]
-; VBITS_EQ_128-NEXT: umulh x5, x5, x9
-; VBITS_EQ_128-NEXT: umulh x1, x29, x8
-; VBITS_EQ_128-NEXT: fmov x8, d23
-; VBITS_EQ_128-NEXT: fmov x9, d24
-; VBITS_EQ_128-NEXT: umulh x22, x22, x15
-; VBITS_EQ_128-NEXT: fmov x15, d17
-; VBITS_EQ_128-NEXT: fmov d9, x14
-; VBITS_EQ_128-NEXT: umulh x19, x19, x8
-; VBITS_EQ_128-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: mov x8, v17.d[1]
-; VBITS_EQ_128-NEXT: umulh x25, x25, x9
-; VBITS_EQ_128-NEXT: mov x9, v25.d[1]
-; VBITS_EQ_128-NEXT: umulh x28, x28, x17
-; VBITS_EQ_128-NEXT: fmov x17, d25
-; VBITS_EQ_128-NEXT: umulh x15, x27, x15
-; VBITS_EQ_128-NEXT: mov x27, v6.d[1]
-; VBITS_EQ_128-NEXT: ldr d15, [sp, #40] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: umulh x12, x26, x8
-; VBITS_EQ_128-NEXT: fmov x26, d6
-; VBITS_EQ_128-NEXT: umulh x17, x24, x17
-; VBITS_EQ_128-NEXT: ldr x8, [sp] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT: mov x24, v26.d[1]
-; VBITS_EQ_128-NEXT: umulh x11, x23, x9
-; VBITS_EQ_128-NEXT: fmov x23, d26
-; VBITS_EQ_128-NEXT: umulh x21, x21, x26
-; VBITS_EQ_128-NEXT: fmov x26, d0
-; VBITS_EQ_128-NEXT: umulh x20, x20, x27
-; VBITS_EQ_128-NEXT: fmov x27, d3
-; VBITS_EQ_128-NEXT: fmov d20, x17
-; VBITS_EQ_128-NEXT: umulh x7, x7, x23
-; VBITS_EQ_128-NEXT: fmov x23, d4
-; VBITS_EQ_128-NEXT: umulh x6, x6, x24
-; VBITS_EQ_128-NEXT: fmov x24, d5
-; VBITS_EQ_128-NEXT: umulh x26, x26, x27
-; VBITS_EQ_128-NEXT: fmov x27, d7
-; VBITS_EQ_128-NEXT: umulh x3, x3, x23
-; VBITS_EQ_128-NEXT: fmov d19, x20
-; VBITS_EQ_128-NEXT: mov x23, v2.d[1]
-; VBITS_EQ_128-NEXT: umulh x2, x2, x24
-; VBITS_EQ_128-NEXT: mov x24, v1.d[1]
-; VBITS_EQ_128-NEXT: umulh x27, x8, x27
-; VBITS_EQ_128-NEXT: mov x29, v0.d[1]
-; VBITS_EQ_128-NEXT: mov x30, v7.d[1]
-; VBITS_EQ_128-NEXT: mov x8, v5.d[1]
-; VBITS_EQ_128-NEXT: mov x9, v4.d[1]
-; VBITS_EQ_128-NEXT: mov x10, v3.d[1]
-; VBITS_EQ_128-NEXT: ldp d10, d12, [sp, #24] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: umulh x30, x13, x30
-; VBITS_EQ_128-NEXT: fmov d0, x27
-; VBITS_EQ_128-NEXT: umulh x8, x23, x8
-; VBITS_EQ_128-NEXT: fmov d2, x2
-; VBITS_EQ_128-NEXT: umulh x9, x24, x9
-; VBITS_EQ_128-NEXT: fmov d4, x3
-; VBITS_EQ_128-NEXT: umulh x10, x29, x10
-; VBITS_EQ_128-NEXT: fmov d6, x26
-; VBITS_EQ_128-NEXT: mov v11.d[1], v10.d[0]
-; VBITS_EQ_128-NEXT: fmov d1, x30
-; VBITS_EQ_128-NEXT: mov v13.d[1], v12.d[0]
-; VBITS_EQ_128-NEXT: mov v15.d[1], v14.d[0]
-; VBITS_EQ_128-NEXT: mov v9.d[1], v8.d[0]
-; VBITS_EQ_128-NEXT: fmov d3, x8
-; VBITS_EQ_128-NEXT: fmov d5, x9
-; VBITS_EQ_128-NEXT: fmov d7, x10
-; VBITS_EQ_128-NEXT: fmov d17, x6
-; VBITS_EQ_128-NEXT: fmov d16, x7
-; VBITS_EQ_128-NEXT: fmov d18, x21
-; VBITS_EQ_128-NEXT: fmov d21, x11
-; VBITS_EQ_128-NEXT: fmov d22, x12
-; VBITS_EQ_128-NEXT: fmov d23, x15
-; VBITS_EQ_128-NEXT: fmov d24, x28
-; VBITS_EQ_128-NEXT: fmov d25, x25
-; VBITS_EQ_128-NEXT: fmov d26, x22
-; VBITS_EQ_128-NEXT: fmov d27, x19
-; VBITS_EQ_128-NEXT: fmov d28, x5
-; VBITS_EQ_128-NEXT: fmov d29, x4
-; VBITS_EQ_128-NEXT: fmov d30, x1
-; VBITS_EQ_128-NEXT: fmov d31, x18
-; VBITS_EQ_128-NEXT: mov v27.d[1], v26.d[0]
-; VBITS_EQ_128-NEXT: stp q9, q15, [x0, #192]
-; VBITS_EQ_128-NEXT: stp q13, q11, [x0, #224]
-; VBITS_EQ_128-NEXT: mov v31.d[1], v30.d[0]
-; VBITS_EQ_128-NEXT: mov v29.d[1], v28.d[0]
-; VBITS_EQ_128-NEXT: mov v25.d[1], v24.d[0]
-; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT: mov v20.d[1], v21.d[0]
-; VBITS_EQ_128-NEXT: mov v18.d[1], v19.d[0]
-; VBITS_EQ_128-NEXT: stp q29, q31, [x0, #160]
-; VBITS_EQ_128-NEXT: mov v16.d[1], v17.d[0]
-; VBITS_EQ_128-NEXT: stp q25, q27, [x0, #128]
-; VBITS_EQ_128-NEXT: mov v6.d[1], v7.d[0]
-; VBITS_EQ_128-NEXT: mov v4.d[1], v5.d[0]
-; VBITS_EQ_128-NEXT: stp q20, q23, [x0, #96]
-; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT: stp q16, q18, [x0, #64]
-; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q4, q6, [x0, #32]
-; VBITS_EQ_128-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT: add sp, sp, #224
-; VBITS_EQ_128-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%1 = zext <32 x i64> %op1 to <32 x i128>
@@ -3506,5 +1310,3 @@ define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
ret void
}
attributes #0 = { "target-features"="+sve" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; VBITS_GE_512: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 9ce3873af774..1e6684b9f0e7 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -1,328 +1,364 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; UADDV
;
; Don't use SVE for 64-bit vectors.
-define i8 @uaddv_v8i8(<8 x i8> %a) #0 {
+define i8 @uaddv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i8:
-; CHECK: addv b0, v0.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
ret i8 %res
}
; Don't use SVE for 128-bit vectors.
-define i8 @uaddv_v16i8(<16 x i8> %a) #0 {
+define i8 @uaddv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i8:
-; CHECK: addv b0, v0.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @uaddv_v32i8(<32 x i8>* %a) #0 {
+define i8 @uaddv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: uaddv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[LO]].b, [[HI]].b
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].b
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: add z0.b, z1.b, z0.b
+; VBITS_GE_256-NEXT: uaddv d0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uaddv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uaddv d0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @uaddv_v128i8(<128 x i8>* %a) #0 {
+define i8 @uaddv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @uaddv_v256i8(<256 x i8>* %a) #0 {
+define i8 @uaddv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
ret i8 %res
}
; Don't use SVE for 64-bit vectors.
-define i16 @uaddv_v4i16(<4 x i16> %a) #0 {
+define i16 @uaddv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i16:
-; CHECK: addv h0, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
ret i16 %res
}
; Don't use SVE for 128-bit vectors.
-define i16 @uaddv_v8i16(<8 x i16> %a) #0 {
+define i16 @uaddv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i16:
-; CHECK: addv h0, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @uaddv_v16i16(<16 x i16>* %a) #0 {
+define i16 @uaddv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @uaddv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: uaddv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[LO]].h, [[HI]].h
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].h
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: add z0.h, z1.h, z0.h
+; VBITS_GE_256-NEXT: uaddv d0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uaddv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uaddv d0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @uaddv_v64i16(<64 x i16>* %a) #0 {
+define i16 @uaddv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @uaddv_v128i16(<128 x i16>* %a) #0 {
+define i16 @uaddv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
ret i16 %res
}
; Don't use SVE for 64-bit vectors.
-define i32 @uaddv_v2i32(<2 x i32> %a) #0 {
+define i32 @uaddv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i32:
-; CHECK: addp v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
ret i32 %res
}
; Don't use SVE for 128-bit vectors.
-define i32 @uaddv_v4i32(<4 x i32> %a) #0 {
+define i32 @uaddv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i32:
-; CHECK: addv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @uaddv_v8i32(<8 x i32>* %a) #0 {
+define i32 @uaddv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @uaddv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: uaddv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[LO]].s, [[HI]].s
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].s
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: add z0.s, z1.s, z0.s
+; VBITS_GE_256-NEXT: uaddv d0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uaddv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uaddv d0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @uaddv_v32i32(<32 x i32>* %a) #0 {
+define i32 @uaddv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @uaddv_v64i32(<64 x i32>* %a) #0 {
+define i32 @uaddv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @uaddv_v1i64(<1 x i64> %a) #0 {
+define i64 @uaddv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
ret i64 %res
}
; Don't use SVE for 128-bit vectors.
-define i64 @uaddv_v2i64(<2 x i64> %a) #0 {
+define i64 @uaddv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i64:
-; CHECK: addp d0, v0.2d
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @uaddv_v4i64(<4 x i64>* %a) #0 {
+define i64 @uaddv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @uaddv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: uaddv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: add z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: uaddv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uaddv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uaddv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @uaddv_v16i64(<16 x i64>* %a) #0 {
+define i64 @uaddv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
+define i64 @uaddv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uaddv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
ret i64 %res
@@ -333,306 +369,342 @@ define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define i8 @smaxv_v8i8(<8 x i8> %a) #0 {
+define i8 @smaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i8:
-; CHECK: smaxv b0, v0.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smaxv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
ret i8 %res
}
; Don't use SVE for 128-bit vectors.
-define i8 @smaxv_v16i8(<16 x i8> %a) #0 {
+define i8 @smaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i8:
-; CHECK: smaxv b0, v0.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smaxv b0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @smaxv_v32i8(<32 x i8>* %a) #0 {
+define i8 @smaxv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: smaxv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @smaxv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: smaxv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT: smaxv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smaxv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: smaxv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @smaxv_v128i8(<128 x i8>* %a) #0 {
+define i8 @smaxv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: smaxv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @smaxv_v256i8(<256 x i8>* %a) #0 {
+define i8 @smaxv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: smaxv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
ret i8 %res
}
; Don't use SVE for 64-bit vectors.
-define i16 @smaxv_v4i16(<4 x i16> %a) #0 {
+define i16 @smaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i16:
-; CHECK: smaxv h0, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smaxv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
ret i16 %res
}
; Don't use SVE for 128-bit vectors.
-define i16 @smaxv_v8i16(<8 x i16> %a) #0 {
+define i16 @smaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i16:
-; CHECK: smaxv h0, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smaxv h0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @smaxv_v16i16(<16 x i16>* %a) #0 {
+define i16 @smaxv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: smaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @smaxv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: smaxv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: smaxv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smaxv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: smaxv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @smaxv_v64i16(<64 x i16>* %a) #0 {
+define i16 @smaxv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: smaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @smaxv_v128i16(<128 x i16>* %a) #0 {
+define i16 @smaxv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: smaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
ret i16 %res
}
; Don't use SVE for 64-bit vectors.
-define i32 @smaxv_v2i32(<2 x i32> %a) #0 {
+define i32 @smaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i32:
-; CHECK: smaxp v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
ret i32 %res
}
; Don't use SVE for 128-bit vectors.
-define i32 @smaxv_v4i32(<4 x i32> %a) #0 {
+define i32 @smaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i32:
-; CHECK: smaxv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: smaxv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @smaxv_v8i32(<8 x i32>* %a) #0 {
+define i32 @smaxv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: smaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @smaxv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: smaxv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: smaxv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smaxv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: smaxv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @smaxv_v32i32(<32 x i32>* %a) #0 {
+define i32 @smaxv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: smaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @smaxv_v64i32(<64 x i32>* %a) #0 {
+define i32 @smaxv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: smaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @smaxv_v1i64(<1 x i64> %a) #0 {
+define i64 @smaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
ret i64 %res
}
; No NEON 64-bit vector SMAXV support. Use SVE.
-define i64 @smaxv_v2i64(<2 x i64> %a) #0 {
+define i64 @smaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: smaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @smaxv_v4i64(<4 x i64>* %a) #0 {
+define i64 @smaxv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: smaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @smaxv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: smaxv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT: smaxv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: smaxv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: smaxv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @smaxv_v16i64(<16 x i64>* %a) #0 {
+define i64 @smaxv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: smaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
+define i64 @smaxv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: smaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
ret i64 %res
@@ -643,306 +715,342 @@ define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define i8 @sminv_v8i8(<8 x i8> %a) #0 {
+define i8 @sminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i8:
-; CHECK: sminv b0, v0.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sminv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
ret i8 %res
}
; Don't use SVE for 128-bit vectors.
-define i8 @sminv_v16i8(<16 x i8> %a) #0 {
+define i8 @sminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v16i8:
-; CHECK: sminv b0, v0.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sminv b0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @sminv_v32i8(<32 x i8>* %a) #0 {
+define i8 @sminv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: sminv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @sminv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: sminv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: sminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT: sminv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sminv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: sminv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @sminv_v128i8(<128 x i8>* %a) #0 {
+define i8 @sminv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: sminv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @sminv_v256i8(<256 x i8>* %a) #0 {
+define i8 @sminv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: sminv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
ret i8 %res
}
; Don't use SVE for 64-bit vectors.
-define i16 @sminv_v4i16(<4 x i16> %a) #0 {
+define i16 @sminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i16:
-; CHECK: sminv h0, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sminv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
ret i16 %res
}
; Don't use SVE for 128-bit vectors.
-define i16 @sminv_v8i16(<8 x i16> %a) #0 {
+define i16 @sminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i16:
-; CHECK: sminv h0, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sminv h0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @sminv_v16i16(<16 x i16>* %a) #0 {
+define i16 @sminv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: sminv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @sminv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: sminv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: sminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: sminv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sminv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: sminv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @sminv_v64i16(<64 x i16>* %a) #0 {
+define i16 @sminv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: sminv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @sminv_v128i16(<128 x i16>* %a) #0 {
+define i16 @sminv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: sminv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
ret i16 %res
}
; Don't use SVE for 64-bit vectors.
-define i32 @sminv_v2i32(<2 x i32> %a) #0 {
+define i32 @sminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v2i32:
-; CHECK: minp v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
ret i32 %res
}
; Don't use SVE for 128-bit vectors.
-define i32 @sminv_v4i32(<4 x i32> %a) #0 {
+define i32 @sminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i32:
-; CHECK: sminv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sminv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @sminv_v8i32(<8 x i32>* %a) #0 {
+define i32 @sminv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: sminv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @sminv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: sminv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: sminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: sminv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sminv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: sminv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @sminv_v32i32(<32 x i32>* %a) #0 {
+define i32 @sminv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: sminv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @sminv_v64i32(<64 x i32>* %a) #0 {
+define i32 @sminv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: sminv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @sminv_v1i64(<1 x i64> %a) #0 {
+define i64 @sminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
ret i64 %res
}
; No NEON 64-bit vector SMINV support. Use SVE.
-define i64 @sminv_v2i64(<2 x i64> %a) #0 {
+define i64 @sminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: sminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @sminv_v4i64(<4 x i64>* %a) #0 {
+define i64 @sminv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: sminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @sminv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: sminv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: sminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT: sminv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: sminv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: sminv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @sminv_v16i64(<16 x i64>* %a) #0 {
+define i64 @sminv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: sminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
+define i64 @sminv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: sminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
ret i64 %res
@@ -953,306 +1061,342 @@ define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define i8 @umaxv_v8i8(<8 x i8> %a) #0 {
+define i8 @umaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i8:
-; CHECK: umaxv b0, v0.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaxv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
ret i8 %res
}
; Don't use SVE for 128-bit vectors.
-define i8 @umaxv_v16i8(<16 x i8> %a) #0 {
+define i8 @umaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v16i8:
-; CHECK: umaxv b0, v0.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaxv b0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @umaxv_v32i8(<32 x i8>* %a) #0 {
+define i8 @umaxv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: umaxv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @umaxv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: umaxv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT: umaxv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umaxv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: umaxv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @umaxv_v128i8(<128 x i8>* %a) #0 {
+define i8 @umaxv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: umaxv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @umaxv_v256i8(<256 x i8>* %a) #0 {
+define i8 @umaxv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: umaxv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
ret i8 %res
}
; Don't use SVE for 64-bit vectors.
-define i16 @umaxv_v4i16(<4 x i16> %a) #0 {
+define i16 @umaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i16:
-; CHECK: umaxv h0, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaxv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
ret i16 %res
}
; Don't use SVE for 128-bit vectors.
-define i16 @umaxv_v8i16(<8 x i16> %a) #0 {
+define i16 @umaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i16:
-; CHECK: umaxv h0, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaxv h0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @umaxv_v16i16(<16 x i16>* %a) #0 {
+define i16 @umaxv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: umaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @umaxv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: umaxv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: umaxv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umaxv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: umaxv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @umaxv_v64i16(<64 x i16>* %a) #0 {
+define i16 @umaxv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: umaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @umaxv_v128i16(<128 x i16>* %a) #0 {
+define i16 @umaxv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: umaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
ret i16 %res
}
; Don't use SVE for 64-bit vectors.
-define i32 @umaxv_v2i32(<2 x i32> %a) #0 {
+define i32 @umaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v2i32:
-; CHECK: umaxp v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
ret i32 %res
}
; Don't use SVE for 128-bit vectors.
-define i32 @umaxv_v4i32(<4 x i32> %a) #0 {
+define i32 @umaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i32:
-; CHECK: umaxv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @umaxv_v8i32(<8 x i32>* %a) #0 {
+define i32 @umaxv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @umaxv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: umaxv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: umaxv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umaxv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: umaxv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @umaxv_v32i32(<32 x i32>* %a) #0 {
+define i32 @umaxv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @umaxv_v64i32(<64 x i32>* %a) #0 {
+define i32 @umaxv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @umaxv_v1i64(<1 x i64> %a) #0 {
+define i64 @umaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
ret i64 %res
}
; No NEON 64-bit vector UMAXV support. Use SVE.
-define i64 @umaxv_v2i64(<2 x i64> %a) #0 {
+define i64 @umaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: umaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @umaxv_v4i64(<4 x i64>* %a) #0 {
+define i64 @umaxv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: umaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @umaxv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: umaxv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT: umaxv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: umaxv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: umaxv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @umaxv_v16i64(<16 x i64>* %a) #0 {
+define i64 @umaxv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: umaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
+define i64 @umaxv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: umaxv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
ret i64 %res
@@ -1263,306 +1407,342 @@ define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define i8 @uminv_v8i8(<8 x i8> %a) #0 {
+define i8 @uminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i8:
-; CHECK: uminv b0, v0.8b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uminv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
ret i8 %res
}
; Don't use SVE for 128-bit vectors.
-define i8 @uminv_v16i8(<16 x i8> %a) #0 {
+define i8 @uminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v16i8:
-; CHECK: uminv b0, v0.16b
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uminv b0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @uminv_v32i8(<32 x i8>* %a) #0 {
+define i8 @uminv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: uminv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @uminv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: uminv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: uminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT: uminv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uminv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uminv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @uminv_v128i8(<128 x i8>* %a) #0 {
+define i8 @uminv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: uminv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @uminv_v256i8(<256 x i8>* %a) #0 {
+define i8 @uminv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: uminv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
ret i8 %res
}
; Don't use SVE for 64-bit vectors.
-define i16 @uminv_v4i16(<4 x i16> %a) #0 {
+define i16 @uminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i16:
-; CHECK: uminv h0, v0.4h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uminv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
ret i16 %res
}
; Don't use SVE for 128-bit vectors.
-define i16 @uminv_v8i16(<8 x i16> %a) #0 {
+define i16 @uminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i16:
-; CHECK: uminv h0, v0.8h
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uminv h0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @uminv_v16i16(<16 x i16>* %a) #0 {
+define i16 @uminv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uminv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @uminv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: uminv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: uminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: uminv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uminv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uminv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @uminv_v64i16(<64 x i16>* %a) #0 {
+define i16 @uminv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uminv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @uminv_v128i16(<128 x i16>* %a) #0 {
+define i16 @uminv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uminv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
ret i16 %res
}
; Don't use SVE for 64-bit vectors.
-define i32 @uminv_v2i32(<2 x i32> %a) #0 {
+define i32 @uminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i32:
-; CHECK: minp v0.2s, v0.2s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
ret i32 %res
}
; Don't use SVE for 128-bit vectors.
-define i32 @uminv_v4i32(<4 x i32> %a) #0 {
+define i32 @uminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i32:
-; CHECK: uminv s0, v0.4s
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uminv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @uminv_v8i32(<8 x i32>* %a) #0 {
+define i32 @uminv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uminv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @uminv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: uminv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: uminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: uminv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uminv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uminv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @uminv_v32i32(<32 x i32>* %a) #0 {
+define i32 @uminv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uminv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @uminv_v64i32(<64 x i32>* %a) #0 {
+define i32 @uminv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uminv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @uminv_v1i64(<1 x i64> %a) #0 {
+define i64 @uminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
ret i64 %res
}
; No NEON 64-bit vector UMINV support. Use SVE.
-define i64 @uminv_v2i64(<2 x i64> %a) #0 {
+define i64 @uminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: uminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @uminv_v4i64(<4 x i64>* %a) #0 {
+define i64 @uminv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @uminv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: uminv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: uminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT: uminv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: uminv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uminv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @uminv_v16i64(<16 x i64>* %a) #0 {
+define i64 @uminv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @uminv_v32i64(<32 x i64>* %a) #0 {
+define i64 @uminv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uminv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 1a7774bd1174..e0dea9c6c962 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -1,19 +1,8 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048,VBITS_EQ_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,172 +13,171 @@ target triple = "aarch64-unknown-linux-gnu"
; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: srem_v8i8:
-; CHECK: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1:z[0-9]+]].b
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s3, [[SCALAR1]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR2]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR7]]
-; CHECK-NEXT: umov [[SCALAR8:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR8]]
-; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v8i8:
-; VBITS_EQ_128: sshll v2.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sshll v3.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT: sunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT: xtn v2.8b, v2.8h
-; VBITS_EQ_128-NEXT: mls v0.8b, v2.8b, v1.8b
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: srem_v8i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0
+; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
+; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: umov w8, v2.h[0]
+; VBITS_GE_256-NEXT: umov w9, v2.h[1]
+; VBITS_GE_256-NEXT: fmov s3, w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[2]
+; VBITS_GE_256-NEXT: mov v3.b[1], w9
+; VBITS_GE_256-NEXT: mov v3.b[2], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[3]
+; VBITS_GE_256-NEXT: mov v3.b[3], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[4]
+; VBITS_GE_256-NEXT: mov v3.b[4], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[5]
+; VBITS_GE_256-NEXT: mov v3.b[5], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[6]
+; VBITS_GE_256-NEXT: mov v3.b[6], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[7]
+; VBITS_GE_256-NEXT: mov v3.b[7], w8
+; VBITS_GE_256-NEXT: mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v8i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: umov w8, v2.h[0]
+; VBITS_GE_512-NEXT: umov w9, v2.h[1]
+; VBITS_GE_512-NEXT: fmov s3, w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[2]
+; VBITS_GE_512-NEXT: mov v3.b[1], w9
+; VBITS_GE_512-NEXT: mov v3.b[2], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[3]
+; VBITS_GE_512-NEXT: mov v3.b[3], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[4]
+; VBITS_GE_512-NEXT: mov v3.b[4], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[5]
+; VBITS_GE_512-NEXT: mov v3.b[5], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[6]
+; VBITS_GE_512-NEXT: mov v3.b[6], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[7]
+; VBITS_GE_512-NEXT: mov v3.b[7], w8
+; VBITS_GE_512-NEXT: mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = srem <8 x i8> %op1, %op2
ret <8 x i8> %res
}
define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: srem_v16i8:
-
-; HALF VECTOR
-; VBITS_EQ_256: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_256-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls v0.16b, v2.16b, v1.16b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v16i8:
-; VBITS_EQ_128: sunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT: sunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sunpkhi z5.s, z2.h
-; VBITS_EQ_128-NEXT: sunpkhi z6.s, z3.h
-; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: sunpklo z4.h, z1.b
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: sunpklo z3.h, z0.b
-; VBITS_EQ_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
-; VBITS_EQ_128-NEXT: sunpkhi z6.s, z4.h
-; VBITS_EQ_128-NEXT: sunpkhi z7.s, z3.h
-; VBITS_EQ_128-NEXT: sunpklo z4.s, z4.h
-; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; VBITS_EQ_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z5.h
-; VBITS_EQ_128-NEXT: uzp1 z3.h, z3.h, z6.h
-; VBITS_EQ_128-NEXT: uzp1 z2.b, z3.b, z2.b
-; VBITS_EQ_128-NEXT: mls v0.16b, v2.16b, v1.16b
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: srem_v16i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: sunpklo z3.h, z0.b
+; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: sunpkhi z6.s, z4.h
+; VBITS_GE_128-NEXT: sunpkhi z7.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h
+; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h
+; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h
+; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h
+; VBITS_GE_256-NEXT: sunpkhi z5.s, z3.h
+; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = srem <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: sdivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_256-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = srem <32 x i8> %op1, %op2
@@ -197,69 +185,23 @@ define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = srem <64 x i8> %op1, %op2
@@ -267,54 +209,26 @@ define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: sunpkhi z4.s, z2.h
+; CHECK-NEXT: sunpkhi z5.s, z3.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = srem <128 x i8> %op1, %op2
@@ -322,36 +236,35 @@ define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v256i8:
-
-; FULL VECTOR:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdivr [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
-; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG]]/m, [[ZIP]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: sunpkhi z2.h, z1.b
+; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: sunpklo z4.h, z1.b
+; CHECK-NEXT: sunpklo z5.h, z0.b
+; CHECK-NEXT: sunpkhi z6.s, z2.h
+; CHECK-NEXT: sunpkhi z7.s, z3.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s
+; CHECK-NEXT: sunpkhi z7.s, z4.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: sunpkhi z3.s, z5.h
+; CHECK-NEXT: sunpklo z4.s, z4.h
+; CHECK-NEXT: sunpklo z5.s, z5.h
+; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z7.s
+; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = srem <256 x i8> %op1, %op2
@@ -362,93 +275,154 @@ define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: srem_v4i16:
-; CHECK: sshll v2.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: sshll v3.4s, v0.4h, #0
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
-; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
-; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
-; CHECK-NEXT: mov [[VEC2:v[0-9]+]].16b, [[VEC]].16b
-; CHECK-NEXT: mov [[VEC2]].h[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[SCALAR3:w[0-9]+]], [[VEC]].s[3]
-; CHECK-NEXT: mov [[VEC2]].h[2], [[SCALAR2]]
-; CHECK-NEXT: mov [[VEC2]].h[3], [[SCALAR3]]
-; CHECK-NEXT: mls v0.4h, [[VEC2]].4h, v1.4h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v4i16:
-; VBITS_EQ_128: sshll v2.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sshll v3.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s
-; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: srem_v4i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: sshll v2.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sshll v3.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s
+; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v4i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: sshll v2.4s, v1.4h, #0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: sshll v3.4s, v0.4h, #0
+; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: mov w8, v2.s[1]
+; VBITS_GE_256-NEXT: mov w9, v2.s[2]
+; VBITS_GE_256-NEXT: mov v3.16b, v2.16b
+; VBITS_GE_256-NEXT: mov v3.h[1], w8
+; VBITS_GE_256-NEXT: mov w8, v2.s[3]
+; VBITS_GE_256-NEXT: mov v3.h[2], w9
+; VBITS_GE_256-NEXT: mov v3.h[3], w8
+; VBITS_GE_256-NEXT: mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v4i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: sshll v2.4s, v1.4h, #0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl4
+; VBITS_GE_512-NEXT: sshll v3.4s, v0.4h, #0
+; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: mov w8, v2.s[1]
+; VBITS_GE_512-NEXT: mov w9, v2.s[2]
+; VBITS_GE_512-NEXT: mov v3.16b, v2.16b
+; VBITS_GE_512-NEXT: mov v3.h[1], w8
+; VBITS_GE_512-NEXT: mov w8, v2.s[3]
+; VBITS_GE_512-NEXT: mov v3.h[2], w9
+; VBITS_GE_512-NEXT: mov v3.h[3], w8
+; VBITS_GE_512-NEXT: mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_512-NEXT: ret
%res = srem <4 x i16> %op1, %op2
ret <4 x i16> %res
}
define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: srem_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v8i16:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT: sunpklo z4.s, z1.h
-; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: sunpklo z5.s, z0.h
-; VBITS_EQ_128-NEXT: movprfx z3, z5
-; VBITS_EQ_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT: uzp1 z2.h, z3.h, z2.h
-; VBITS_EQ_128-NEXT: mls v0.8h, v2.8h, v1.8h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: srem_v8i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: sunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT: movprfx z3, z5
+; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = srem <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: srem_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_256-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
-
+; VBITS_GE_128-LABEL: srem_v16i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpkhi z17.s, z2.h
+; VBITS_GE_128-NEXT: ldp q3, q1, [x1]
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z7.s, z0.h
+; VBITS_GE_128-NEXT: sunpkhi z16.s, z3.h
+; VBITS_GE_128-NEXT: sdivr z16.s, p0/m, z16.s, z17.s
+; VBITS_GE_128-NEXT: sunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z6.s, z1.h
+; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: sunpklo z5.s, z3.h
+; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT: sunpklo z7.s, z2.h
+; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h
+; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h
+; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v3.8h
+; VBITS_GE_128-NEXT: mls v0.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z5.s, z0.h
+; VBITS_GE_256-NEXT: movprfx z3, z5
+; VBITS_GE_256-NEXT: sdiv z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ptrue p1.s, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = srem <16 x i16> %op1, %op2
@@ -456,37 +430,20 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = srem <32 x i16> %op1, %op2
@@ -494,35 +451,20 @@ define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i16:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = srem <64 x i16> %op1, %op2
@@ -530,23 +472,24 @@ define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i16:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: movprfx [[OP3_LO:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_2048-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP3_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_2048-NEXT: mls [[OP1]].h, [[PG]]/m, [[ZIP]].h, [[OP2]].h
-; VBITS_EQ_2048-NEXT: st1h { [[OP1]].h }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: sunpklo z4.s, z1.h
+; CHECK-NEXT: sunpklo z5.s, z0.h
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = srem <128 x i16> %op1, %op2
@@ -555,55 +498,48 @@ define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
-define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v2i32:
-; VBITS_EQ_128: ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT: mls v0.2s, v2.2s, v1.2s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = srem <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
-define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: srem_v4i32:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT: mls v0.4s, v2.4s, v1.4s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = srem <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; CHECK-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = srem <8 x i32> %op1, %op2
@@ -612,15 +548,57 @@ define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @srem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: srem_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: srem_v16i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z0
+; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z4.s
+; VBITS_GE_128-NEXT: mls v0.4s, v16.4s, v4.4s
+; VBITS_GE_128-NEXT: movprfx z4, z3
+; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z5.s
+; VBITS_GE_128-NEXT: mls v1.4s, v16.4s, v5.4s
+; VBITS_GE_128-NEXT: movprfx z5, z2
+; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s
+; VBITS_GE_128-NEXT: mls v3.4s, v4.4s, v6.4s
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: stp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: sdiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT: mls z0.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = srem <16 x i32> %op1, %op2
@@ -628,16 +606,17 @@ define void @srem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = srem <32 x i32> %op1, %op2
@@ -645,16 +624,17 @@ define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = srem <64 x i32> %op1, %op2
@@ -664,60 +644,49 @@ define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; Vector i64 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub d0, d0, d1
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: srem_v1i64:
-; VBITS_EQ_128: ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT: sub d0, d0, d1
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: ret
%res = srem <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: srem_v2i64:
-; VBITS_EQ_128: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = srem <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; CHECK-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = srem <4 x i64> %op1, %op2
@@ -726,15 +695,61 @@ define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @srem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: srem_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: srem_v8i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1]
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: movprfx z16, z3
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z5.d
+; VBITS_GE_128-NEXT: movprfx z17, z2
+; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z4.d
+; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
+; VBITS_GE_128-NEXT: movprfx z17, z0
+; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z7.d
+; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
+; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
+; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
+; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
+; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
+; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: sdiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = srem <8 x i64> %op1, %op2
@@ -742,16 +757,17 @@ define void @srem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = srem <16 x i64> %op1, %op2
@@ -759,16 +775,17 @@ define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = srem <32 x i64> %op1, %op2
@@ -783,172 +800,171 @@ define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; Vector vXi8 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: urem_v8i8:
-; CHECK: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1:z[0-9]+]].b
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s3, [[SCALAR0]]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
-; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v8i8:
-; VBITS_EQ_128: ushll v2.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: ushll v3.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT: uunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT: xtn v2.8b, v2.8h
-; VBITS_EQ_128-NEXT: mls v0.8b, v2.8b, v1.8b
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: urem_v8i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0
+; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
+; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: umov w8, v2.h[0]
+; VBITS_GE_256-NEXT: umov w9, v2.h[1]
+; VBITS_GE_256-NEXT: fmov s3, w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[2]
+; VBITS_GE_256-NEXT: mov v3.b[1], w9
+; VBITS_GE_256-NEXT: mov v3.b[2], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[3]
+; VBITS_GE_256-NEXT: mov v3.b[3], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[4]
+; VBITS_GE_256-NEXT: mov v3.b[4], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[5]
+; VBITS_GE_256-NEXT: mov v3.b[5], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[6]
+; VBITS_GE_256-NEXT: mov v3.b[6], w8
+; VBITS_GE_256-NEXT: umov w8, v2.h[7]
+; VBITS_GE_256-NEXT: mov v3.b[7], w8
+; VBITS_GE_256-NEXT: mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v8i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: umov w8, v2.h[0]
+; VBITS_GE_512-NEXT: umov w9, v2.h[1]
+; VBITS_GE_512-NEXT: fmov s3, w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[2]
+; VBITS_GE_512-NEXT: mov v3.b[1], w9
+; VBITS_GE_512-NEXT: mov v3.b[2], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[3]
+; VBITS_GE_512-NEXT: mov v3.b[3], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[4]
+; VBITS_GE_512-NEXT: mov v3.b[4], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[5]
+; VBITS_GE_512-NEXT: mov v3.b[5], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[6]
+; VBITS_GE_512-NEXT: mov v3.b[6], w8
+; VBITS_GE_512-NEXT: umov w8, v2.h[7]
+; VBITS_GE_512-NEXT: mov v3.b[7], w8
+; VBITS_GE_512-NEXT: mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = urem <8 x i8> %op1, %op2
ret <8 x i8> %res
}
define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: urem_v16i8:
-
-; HALF VECTOR
-; VBITS_EQ_256: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_256-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls v0.16b, v2.16b, v1.16b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v16i8:
-; VBITS_EQ_128: uunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT: uunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: uunpkhi z5.s, z2.h
-; VBITS_EQ_128-NEXT: uunpkhi z6.s, z3.h
-; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: uunpklo z4.h, z1.b
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: uunpklo z3.h, z0.b
-; VBITS_EQ_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
-; VBITS_EQ_128-NEXT: uunpkhi z6.s, z4.h
-; VBITS_EQ_128-NEXT: uunpkhi z7.s, z3.h
-; VBITS_EQ_128-NEXT: uunpklo z4.s, z4.h
-; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
-; VBITS_EQ_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z5.h
-; VBITS_EQ_128-NEXT: uzp1 z3.h, z3.h, z6.h
-; VBITS_EQ_128-NEXT: uzp1 z2.b, z3.b, z2.b
-; VBITS_EQ_128-NEXT: mls v0.16b, v2.16b, v1.16b
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: urem_v16i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: uunpklo z3.h, z0.b
+; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: uunpkhi z6.s, z4.h
+; VBITS_GE_128-NEXT: uunpkhi z7.s, z3.h
+; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h
+; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h
+; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h
+; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h
+; VBITS_GE_256-NEXT: uunpkhi z5.s, z3.h
+; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = urem <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: udivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_256-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z2.h, z1.b
+; CHECK-NEXT: uunpklo z3.h, z0.b
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = urem <32 x i8> %op1, %op2
@@ -956,69 +972,23 @@ define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
-define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: udivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z2.h, z1.b
+; CHECK-NEXT: uunpklo z3.h, z0.b
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = urem <64 x i8> %op1, %op2
@@ -1026,54 +996,26 @@ define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z2.h, z1.b
+; CHECK-NEXT: uunpklo z3.h, z0.b
+; CHECK-NEXT: uunpkhi z4.s, z2.h
+; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = urem <128 x i8> %op1, %op2
@@ -1081,34 +1023,35 @@ define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v256i8:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udivr [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
-; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG]]/m, [[ZIP]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: uunpkhi z2.h, z1.b
+; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: uunpklo z4.h, z1.b
+; CHECK-NEXT: uunpklo z5.h, z0.b
+; CHECK-NEXT: uunpkhi z6.s, z2.h
+; CHECK-NEXT: uunpkhi z7.s, z3.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s
+; CHECK-NEXT: uunpkhi z7.s, z4.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uunpkhi z3.s, z5.h
+; CHECK-NEXT: uunpklo z4.s, z4.h
+; CHECK-NEXT: uunpklo z5.s, z5.h
+; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z7.s
+; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = urem <256 x i8> %op1, %op2
@@ -1119,92 +1062,154 @@ define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; Vector vXi16 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: urem_v4i16:
-; CHECK: ushll v2.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ushll v3.4s, v0.4h, #0
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
-; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
-; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
-; CHECK-NEXT: mov v3.16b, v2.16b
-; CHECK-NEXT: mov [[VECO:v[0-9]+]].h[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[SCALAR3:w[0-9]+]], [[VEC]].s[3]
-; CHECK-NEXT: mov [[VECO]].h[2], [[SCALAR2]]
-; CHECK-NEXT: mov [[VECO]].h[3], [[SCALAR3]]
-; CHECK-NEXT: mls v0.4h, [[VECO]].4h, v1.4h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v4i16:
-; VBITS_EQ_128: ushll v2.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: ushll v3.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s
-; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: urem_v4i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ushll v2.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ushll v3.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s
+; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v4i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ushll v2.4s, v1.4h, #0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: ushll v3.4s, v0.4h, #0
+; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: mov w8, v2.s[1]
+; VBITS_GE_256-NEXT: mov w9, v2.s[2]
+; VBITS_GE_256-NEXT: mov v3.16b, v2.16b
+; VBITS_GE_256-NEXT: mov v3.h[1], w8
+; VBITS_GE_256-NEXT: mov w8, v2.s[3]
+; VBITS_GE_256-NEXT: mov v3.h[2], w9
+; VBITS_GE_256-NEXT: mov v3.h[3], w8
+; VBITS_GE_256-NEXT: mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v4i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ushll v2.4s, v1.4h, #0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl4
+; VBITS_GE_512-NEXT: ushll v3.4s, v0.4h, #0
+; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: mov w8, v2.s[1]
+; VBITS_GE_512-NEXT: mov w9, v2.s[2]
+; VBITS_GE_512-NEXT: mov v3.16b, v2.16b
+; VBITS_GE_512-NEXT: mov v3.h[1], w8
+; VBITS_GE_512-NEXT: mov w8, v2.s[3]
+; VBITS_GE_512-NEXT: mov v3.h[2], w9
+; VBITS_GE_512-NEXT: mov v3.h[3], w8
+; VBITS_GE_512-NEXT: mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_512-NEXT: ret
%res = urem <4 x i16> %op1, %op2
ret <4 x i16> %res
}
define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: urem_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v8i16:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT: uunpklo z4.s, z1.h
-; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT: uunpklo z5.s, z0.h
-; VBITS_EQ_128-NEXT: movprfx z3, z5
-; VBITS_EQ_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT: uzp1 z2.h, z3.h, z2.h
-; VBITS_EQ_128-NEXT: mls v0.8h, v2.8h, v1.8h
-; VBITS_EQ_128-NEXT: ret
-
+; VBITS_GE_128-LABEL: urem_v8i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: uunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: uunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT: movprfx z3, z5
+; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%res = urem <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: urem_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_256-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; VBITS_GE_128-LABEL: urem_v16i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: uunpkhi z17.s, z2.h
+; VBITS_GE_128-NEXT: ldp q3, q1, [x1]
+; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT: uunpklo z7.s, z0.h
+; VBITS_GE_128-NEXT: uunpkhi z16.s, z3.h
+; VBITS_GE_128-NEXT: udivr z16.s, p0/m, z16.s, z17.s
+; VBITS_GE_128-NEXT: uunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT: uunpklo z6.s, z1.h
+; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: uunpklo z5.s, z3.h
+; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT: uunpklo z7.s, z2.h
+; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h
+; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h
+; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v3.8h
+; VBITS_GE_128-NEXT: mls v0.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z5.s, z0.h
+; VBITS_GE_256-NEXT: movprfx z3, z5
+; VBITS_GE_256-NEXT: udiv z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ptrue p1.s, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = urem <16 x i16> %op1, %op2
@@ -1212,37 +1217,20 @@ define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
-define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpklo z3.s, z0.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = urem <32 x i16> %op1, %op2
@@ -1250,35 +1238,20 @@ define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i16:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpklo z3.s, z0.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = urem <64 x i16> %op1, %op2
@@ -1286,23 +1259,24 @@ define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i16:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: movprfx [[RES_LO:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_2048-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[RES_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_2048-NEXT: mls [[OP1]].h, [[PG]]/m, [[ZIP]].h, [[OP2]].h
-; VBITS_EQ_2048-NEXT: st1h { [[OP1]].h }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ptrue p1.s, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT: uunpklo z4.s, z1.h
+; CHECK-NEXT: uunpklo z5.s, z0.h
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = urem <128 x i16> %op1, %op2
@@ -1311,55 +1285,48 @@ define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Vector v2i32 udiv are not legal for NEON so use SVE when available.
-define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v2i32:
-; VBITS_EQ_128: ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: udiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT: mls v0.2s, v2.2s, v1.2s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = urem <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 udiv are not legal for NEON so use SVE when available.
-define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: urem_v4i32:
-; VBITS_EQ_128: ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: udiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT: mls v0.4s, v2.4s, v1.4s
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = urem <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; CHECK-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = urem <8 x i32> %op1, %op2
@@ -1368,15 +1335,57 @@ define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @urem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: urem_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: urem_v16i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z0
+; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z4.s
+; VBITS_GE_128-NEXT: mls v0.4s, v16.4s, v4.4s
+; VBITS_GE_128-NEXT: movprfx z4, z3
+; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z5.s
+; VBITS_GE_128-NEXT: mls v1.4s, v16.4s, v5.4s
+; VBITS_GE_128-NEXT: movprfx z5, z2
+; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s
+; VBITS_GE_128-NEXT: mls v3.4s, v4.4s, v6.4s
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: stp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: udiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT: mls z0.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = urem <16 x i32> %op1, %op2
@@ -1384,16 +1393,17 @@ define void @urem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = urem <32 x i32> %op1, %op2
@@ -1401,16 +1411,17 @@ define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = urem <64 x i32> %op1, %op2
@@ -1420,60 +1431,49 @@ define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; Vector i64 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub d0, d0, d1
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: urem_v1i64:
-; VBITS_EQ_128: ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT: sub d0, d0, d1
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: ret
%res = urem <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: urem_v2i64:
-; VBITS_EQ_128: ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT: movprfx z2, z0
-; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
-; VBITS_EQ_128-NEXT: ret
-
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = urem <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; CHECK-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = urem <4 x i64> %op1, %op2
@@ -1482,15 +1482,61 @@ define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @urem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: urem_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: urem_v8i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: ldp q4, q5, [x1]
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: movprfx z16, z3
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z5.d
+; VBITS_GE_128-NEXT: movprfx z17, z2
+; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z4.d
+; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
+; VBITS_GE_128-NEXT: movprfx z17, z0
+; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z7.d
+; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
+; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
+; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
+; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
+; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
+; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: udiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = urem <8 x i64> %op1, %op2
@@ -1498,16 +1544,17 @@ define void @urem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = urem <16 x i64> %op1, %op2
@@ -1515,16 +1562,17 @@ define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @urem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @urem_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = urem <32 x i64> %op1, %op2
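
For reference, the lowering these checks exercise is the standard remainder expansion a - (a / b) * b: the operands are widened with uunpklo/uunpkhi where needed, divided with the predicated SVE udiv/udivr, narrowed back with uzp1, and the remainder is formed with mls (or mul followed by sub for the i64 cases). The sketch below is a minimal, illustrative input that produces this pattern; the function name, RUN invocation and attribute group are assumptions for illustration and are not part of the commit.

; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 < %s
define void @urem_example(<8 x i32>* %a, <8 x i32>* %b) #0 {
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  ; Expected codegen: a predicated udiv of the two vectors followed by an mls
  ; that subtracts quotient * divisor from the original dividend.
  %res = urem <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}
attributes #0 = { "target-features"="+sve" }
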
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
index 639b4a96e364..8b76c00631bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
@@ -1,62 +1,50 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 {
+define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i8:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.8b, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v2.8b, w8
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
ret <8 x i8> %sel
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 {
+define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i8:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.16b, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v2.16b, w8
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
ret <16 x i8> %sel
}
-define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
+define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v32i8:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: and z2.b, z2.b, #0x1
+; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
+; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <32 x i8>, <32 x i8>* %a
%op2 = load volatile <32 x i8>, <32 x i8>* %b
%sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@@ -65,18 +53,38 @@ define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
}
define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v64i8:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ptrue p1.b
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.b, w9
+; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z4.b, #0
+; VBITS_GE_256-NEXT: sel z1.b, p1, z1.b, z3.b
+; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z2.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: select_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.b
+; VBITS_GE_512-NEXT: mov z2.b, w8
+; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1
+; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0
+; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load volatile <64 x i8>, <64 x i8>* %a
%op2 = load volatile <64 x i8>, <64 x i8>* %b
%sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2
@@ -84,19 +92,20 @@ define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
+define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v128i8:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: and z2.b, z2.b, #0x1
+; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
+; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <128 x i8>, <128 x i8>* %a
%op2 = load volatile <128 x i8>, <128 x i8>* %b
%sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2
@@ -104,19 +113,20 @@ define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
+define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v256i8:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: and z2.b, z2.b, #0x1
+; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
+; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <256 x i8>, <256 x i8>* %a
%op2 = load volatile <256 x i8>, <256 x i8>* %b
%sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2
@@ -125,42 +135,45 @@ define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 {
+define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i16:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.4h, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v2.4h, w8
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
ret <4 x i16> %sel
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 {
+define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i16:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.8h, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v2.8h, w8
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
ret <8 x i16> %sel
}
-define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
+define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i16:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <16 x i16>, <16 x i16>* %a
%op2 = load volatile <16 x i16>, <16 x i16>* %b
%sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@@ -169,18 +182,38 @@ define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
}
define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v32i16:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.h
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.h, w9
+; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h
+; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: select_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.h
+; VBITS_GE_512-NEXT: mov z2.h, w8
+; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
+; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
+; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load volatile <32 x i16>, <32 x i16>* %a
%op2 = load volatile <32 x i16>, <32 x i16>* %b
%sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2
@@ -188,19 +221,20 @@ define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
+define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64i16:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <64 x i16>, <64 x i16>* %a
%op2 = load volatile <64 x i16>, <64 x i16>* %b
%sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2
@@ -208,19 +242,20 @@ define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
+define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128i16:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <128 x i16>, <128 x i16>* %a
%op2 = load volatile <128 x i16>, <128 x i16>* %b
%sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2
@@ -229,42 +264,45 @@ define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 {
+define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i32:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.2s, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v2.2s, w8
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
ret <2 x i32> %sel
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 {
+define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i32:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
ret <4 x i32> %sel
}
-define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
+define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i32:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: and z2.s, z2.s, #0x1
+; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <8 x i32>, <8 x i32>* %a
%op2 = load volatile <8 x i32>, <8 x i32>* %b
%sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@@ -273,18 +311,38 @@ define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
}
define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v16i32:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ptrue p1.s
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.s, w9
+; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z4.s, #0
+; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s
+; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: select_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.s
+; VBITS_GE_512-NEXT: mov z2.s, w8
+; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
+; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
+; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load volatile <16 x i32>, <16 x i32>* %a
%op2 = load volatile <16 x i32>, <16 x i32>* %b
%sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2
@@ -292,19 +350,20 @@ define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
+define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32i32:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: and z2.s, z2.s, #0x1
+; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <32 x i32>, <32 x i32>* %a
%op2 = load volatile <32 x i32>, <32 x i32>* %b
%sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2
@@ -312,19 +371,20 @@ define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
+define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64i32:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: and z2.s, z2.s, #0x1
+; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <64 x i32>, <64 x i32>* %a
%op2 = load volatile <64 x i32>, <64 x i32>* %b
%sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2
@@ -333,42 +393,45 @@ define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 {
+define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1i64:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm x8, ne
-; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: fmov d2, x8
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
ret <1 x i64> %sel
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 {
+define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i64:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm x8, ne
-; CHECK-NEXT: dup v2.2d, x8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
ret <2 x i64> %sel
}
-define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
+define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i64:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: and z2.d, z2.d, #0x1
+; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <4 x i64>, <4 x i64>* %a
%op2 = load volatile <4 x i64>, <4 x i64>* %b
%sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
@@ -377,18 +440,38 @@ define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
}
define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v8i64:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ptrue p1.d
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z4.d, x9
+; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
+; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z4.d, #0
+; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: select_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.d
+; VBITS_GE_512-NEXT: mov z2.d, x8
+; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
+; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load volatile <8 x i64>, <8 x i64>* %a
%op2 = load volatile <8 x i64>, <8 x i64>* %b
%sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2
@@ -396,19 +479,20 @@ define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
+define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16i64:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: and z2.d, z2.d, #0x1
+; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <16 x i64>, <16 x i64>* %a
%op2 = load volatile <16 x i64>, <16 x i64>* %b
%sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2
@@ -416,19 +500,20 @@ define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
ret void
}
-define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) #0 {
+define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32i64:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: and z2.d, z2.d, #0x1
+; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load volatile <32 x i64>, <32 x i64>* %a
%op2 = load volatile <32 x i64>, <32 x i64>* %b
%sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
index 4cbc916a59cd..23e37d3c8ad0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
@@ -1,57 +1,45 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; ASHR
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i8:
-; CHECK: neg v1.8b, v1.8b
-; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8b, v1.8b
+; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = ashr <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v16i8:
-; CHECK: neg v1.16b, v1.16b
-; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.16b, v1.16b
+; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = ashr <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @ashr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @ashr_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = ashr <32 x i8> %op1, %op2
@@ -60,26 +48,28 @@ define void @ashr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @ashr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: ashr_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: asr z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: asr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ashr_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: asr z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = ashr <64 x i8> %op1, %op2
@@ -87,14 +77,15 @@ define void @ashr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @ashr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @ashr_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = ashr <128 x i8> %op1, %op2
@@ -102,14 +93,15 @@ define void @ashr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @ashr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @ashr_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = ashr <256 x i8> %op1, %op2
@@ -118,33 +110,36 @@ define void @ashr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i16:
-; CHECK: neg v1.4h, v1.4h
-; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.4h, v1.4h
+; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = ashr <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i16:
-; CHECK: neg v1.8h, v1.8h
-; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8h, v1.8h
+; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = ashr <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @ashr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @ashr_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = ashr <16 x i16> %op1, %op2
@@ -153,26 +148,28 @@ define void @ashr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @ashr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: ashr_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: asr z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: asr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ashr_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: asr z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = ashr <32 x i16> %op1, %op2
@@ -180,14 +177,15 @@ define void @ashr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @ashr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @ashr_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = ashr <64 x i16> %op1, %op2
@@ -195,14 +193,15 @@ define void @ashr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @ashr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @ashr_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = ashr <128 x i16> %op1, %op2
@@ -211,33 +210,36 @@ define void @ashr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v2i32:
-; CHECK: neg v1.2s, v1.2s
-; CHECK-NEXT: sshl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.2s, v1.2s
+; CHECK-NEXT: sshl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = ashr <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i32:
-; CHECK: neg v1.4s, v1.4s
-; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = ashr <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @ashr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @ashr_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = ashr <8 x i32> %op1, %op2
@@ -246,26 +248,28 @@ define void @ashr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @ashr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: ashr_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: asr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: asr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ashr_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: asr z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = ashr <16 x i32> %op1, %op2
@@ -273,14 +277,15 @@ define void @ashr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @ashr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @ashr_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = ashr <32 x i32> %op1, %op2
@@ -288,14 +293,15 @@ define void @ashr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @ashr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @ashr_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = ashr <64 x i32> %op1, %op2
@@ -304,33 +310,36 @@ define void @ashr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v1i64:
-; CHECK: neg d1, d1
-; CHECK-NEXT: sshl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg d1, d1
+; CHECK-NEXT: sshl d0, d0, d1
+; CHECK-NEXT: ret
%res = ashr <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v2i64:
-; CHECK: neg v1.2d, v1.2d
-; CHECK-NEXT: sshl v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.2d, v1.2d
+; CHECK-NEXT: sshl v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = ashr <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @ashr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @ashr_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = ashr <4 x i64> %op1, %op2
@@ -339,26 +348,28 @@ define void @ashr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @ashr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: ashr_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: asr z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: asr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: ashr_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: asr z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = ashr <8 x i64> %op1, %op2
@@ -366,14 +377,15 @@ define void @ashr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @ashr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @ashr_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = ashr <16 x i64> %op1, %op2
@@ -381,14 +393,15 @@ define void @ashr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @ashr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @ashr_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = ashr <32 x i64> %op1, %op2
@@ -401,33 +414,36 @@ define void @ashr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i8:
-; CHECK: neg v1.8b, v1.8b
-; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8b, v1.8b
+; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = lshr <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v16i8:
-; CHECK: neg v1.16b, v1.16b
-; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.16b, v1.16b
+; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = lshr <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @lshr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @lshr_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = lshr <32 x i8> %op1, %op2
@@ -436,26 +452,28 @@ define void @lshr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @lshr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: lshr_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsr z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: lsr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: lshr_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsr z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = lshr <64 x i8> %op1, %op2
@@ -463,14 +481,15 @@ define void @lshr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @lshr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @lshr_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = lshr <128 x i8> %op1, %op2
@@ -478,14 +497,15 @@ define void @lshr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @lshr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @lshr_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = lshr <256 x i8> %op1, %op2
@@ -494,33 +514,36 @@ define void @lshr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i16:
-; CHECK: neg v1.4h, v1.4h
-; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.4h, v1.4h
+; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = lshr <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i16:
-; CHECK: neg v1.8h, v1.8h
-; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8h, v1.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = lshr <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @lshr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @lshr_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = lshr <16 x i16> %op1, %op2
@@ -529,26 +552,28 @@ define void @lshr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @lshr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: lshr_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsr z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: lsr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: lshr_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsr z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = lshr <32 x i16> %op1, %op2
@@ -556,14 +581,15 @@ define void @lshr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @lshr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @lshr_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = lshr <64 x i16> %op1, %op2
@@ -571,14 +597,15 @@ define void @lshr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @lshr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @lshr_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = lshr <128 x i16> %op1, %op2
@@ -587,33 +614,36 @@ define void @lshr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v2i32:
-; CHECK: neg v1.2s, v1.2s
-; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.2s, v1.2s
+; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = lshr <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i32:
-; CHECK: neg v1.4s, v1.4s
-; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = lshr <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @lshr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @lshr_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = lshr <8 x i32> %op1, %op2
@@ -622,26 +652,28 @@ define void @lshr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @lshr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: lshr_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: lsr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: lshr_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = lshr <16 x i32> %op1, %op2
@@ -649,14 +681,15 @@ define void @lshr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @lshr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @lshr_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = lshr <32 x i32> %op1, %op2
@@ -664,14 +697,15 @@ define void @lshr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @lshr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @lshr_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = lshr <64 x i32> %op1, %op2
@@ -680,33 +714,36 @@ define void @lshr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v1i64:
-; CHECK: neg d1, d1
-; CHECK-NEXT: ushl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg d1, d1
+; CHECK-NEXT: ushl d0, d0, d1
+; CHECK-NEXT: ret
%res = lshr <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v2i64:
-; CHECK: neg v1.2d, v1.2d
-; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.2d, v1.2d
+; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = lshr <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @lshr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @lshr_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = lshr <4 x i64> %op1, %op2
@@ -715,26 +752,28 @@ define void @lshr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @lshr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: lshr_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsr z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: lsr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: lshr_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsr z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = lshr <8 x i64> %op1, %op2
@@ -742,14 +781,15 @@ define void @lshr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @lshr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @lshr_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = lshr <16 x i64> %op1, %op2
@@ -757,14 +797,15 @@ define void @lshr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @lshr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @lshr_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = lshr <32 x i64> %op1, %op2
@@ -777,31 +818,34 @@ define void @lshr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i8:
-; CHECK: ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%res = shl <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v16i8:
-; CHECK: ushl v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%res = shl <16 x i8> %op1, %op2
ret <16 x i8> %res
}
-define void @shl_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @shl_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = shl <32 x i8> %op1, %op2
@@ -810,26 +854,28 @@ define void @shl_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @shl_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: shl_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsl z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT: lsl z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: shl_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsl z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = shl <64 x i8> %op1, %op2
@@ -837,14 +883,15 @@ define void @shl_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @shl_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @shl_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = shl <128 x i8> %op1, %op2
@@ -852,14 +899,15 @@ define void @shl_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @shl_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @shl_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = shl <256 x i8> %op1, %op2
@@ -868,31 +916,34 @@ define void @shl_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i16:
-; CHECK: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%res = shl <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i16:
-; CHECK: ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%res = shl <8 x i16> %op1, %op2
ret <8 x i16> %res
}
-define void @shl_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @shl_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = shl <16 x i16> %op1, %op2
@@ -901,26 +952,28 @@ define void @shl_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @shl_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: shl_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsl z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT: lsl z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: shl_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsl z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = shl <32 x i16> %op1, %op2
@@ -928,14 +981,15 @@ define void @shl_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @shl_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @shl_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = shl <64 x i16> %op1, %op2
@@ -943,14 +997,15 @@ define void @shl_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @shl_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @shl_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = shl <128 x i16> %op1, %op2
@@ -959,31 +1014,34 @@ define void @shl_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v2i32:
-; CHECK: ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%res = shl <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i32:
-; CHECK: ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%res = shl <4 x i32> %op1, %op2
ret <4 x i32> %res
}
-define void @shl_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @shl_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = shl <8 x i32> %op1, %op2
@@ -992,26 +1050,28 @@ define void @shl_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @shl_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: shl_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsl z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT: lsl z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: shl_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = shl <16 x i32> %op1, %op2
@@ -1019,14 +1079,15 @@ define void @shl_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @shl_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @shl_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = shl <32 x i32> %op1, %op2
@@ -1034,14 +1095,15 @@ define void @shl_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @shl_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @shl_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = shl <64 x i32> %op1, %op2
@@ -1050,31 +1112,34 @@ define void @shl_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v1i64:
-; CHECK: ushl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl d0, d0, d1
+; CHECK-NEXT: ret
%res = shl <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v2i64:
-; CHECK: ushl v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%res = shl <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-define void @shl_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @shl_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = shl <4 x i64> %op1, %op2
@@ -1083,26 +1148,28 @@ define void @shl_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @shl_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: shl_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: lsl z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: shl_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: lsl z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = shl <8 x i64> %op1, %op2
@@ -1110,14 +1177,15 @@ define void @shl_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @shl_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @shl_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = shl <16 x i64> %op1, %op2
@@ -1125,14 +1193,15 @@ define void @shl_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @shl_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @shl_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = shl <32 x i64> %op1, %op2
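The same restructuring repeats for every element width above: the vscale_range attribute now carries the register-width guarantee that the extra RUN lines used to provide. As a rough working guide (an SVE register is vscale x 128 bits wide, and a maximum of 0 means unbounded):

    minimum width = vscale_min * 128 bits
      vscale_range(2,0)  -> at least 256-bit registers
      vscale_range(8,0)  -> at least 1024-bit registers (previously VBITS_GE_1024)
      vscale_range(16,0) -> at least 2048-bit registers (previously VBITS_GE_2048)

Because every remaining RUN configuration satisfies those bounds, such functions share a single auto-generated CHECK block instead of needing the old VBITS_GE_1024/VBITS_GE_2048 prefixes.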
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 7de5eefd74db..a0c4b4313917 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
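With the RUN lines collapsed to the common CHECK prefix plus VBITS_GE_256 and VBITS_GE_512, the assertion blocks in this file are regenerated rather than written by hand. A typical invocation, assuming a built llc is on PATH (or supplied via --llc-binary), would be:

    python llvm/utils/update_llc_test_checks.py \
        llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll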
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
+define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i16_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf v0.4h, v0.4h
@@ -34,7 +20,7 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
+define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v8i16_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -47,7 +33,7 @@ define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
ret void
}
-define void @ucvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 {
+define void @ucvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v16i16_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -87,84 +73,28 @@ define void @ucvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @ucvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i16_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v64i16_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v64i16_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = uitofp <64 x i16> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
ret void
}
-define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v128i16_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #96
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #80
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #112
-; VBITS_GE_256-NEXT: mov x14, #64
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT: ucvtf z5.h, p0/m, z5.h
-; VBITS_GE_256-NEXT: ucvtf z4.h, p0/m, z4.h
-; VBITS_GE_256-NEXT: ucvtf z6.h, p0/m, z6.h
-; VBITS_GE_256-NEXT: ucvtf z7.h, p0/m, z7.h
-; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v128i16_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v128i16_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%res = uitofp <128 x i16> %op1 to <128 x half>
store <128 x half> %res, <128 x half>* %b
@@ -176,7 +106,7 @@ define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
+define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i16_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d1, #0x00ffff0000ffff
@@ -188,7 +118,7 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
+define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i16_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
@@ -198,7 +128,7 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
ret <4 x float> %res
}
-define void @ucvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 {
+define void @ucvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v8i16_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -244,102 +174,32 @@ define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v32i16_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v32i16_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%res = uitofp <32 x i16> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
-define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i16_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #32
-; VBITS_GE_256-NEXT: mov x10, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT: ucvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT: ucvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT: ucvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: uunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: ucvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT: ucvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v64i16_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v64i16_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = uitofp <64 x i16> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@@ -351,7 +211,7 @@ define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
;
; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
-define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
+define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i16_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -366,7 +226,7 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
+define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i16_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d1, #0x00ffff0000ffff
@@ -378,7 +238,7 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
ret <2 x double> %res
}
-define void @ucvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 {
+define void @ucvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i16_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -426,119 +286,34 @@ define void @ucvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov z1.d, z0.d
-; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: uunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i16_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i16_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%res = uitofp <16 x i16> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: uunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: uunpklo z6.s, z6.h
-; VBITS_GE_256-NEXT: movprfx z0, z5
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z5.d
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z6.s
-; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: uunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: movprfx z0, z4
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z4.d
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i16_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i16_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%res = uitofp <32 x i16> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -550,7 +325,7 @@ define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
+define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i32_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -562,7 +337,7 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
+define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i32_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf v0.4s, v0.4s
@@ -572,7 +347,7 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
ret <4 x half> %res
}
-define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) #0 {
+define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v8i32_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -621,110 +396,34 @@ define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 {
ret void
}
-define void @ucvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.h, vl8
-; VBITS_GE_256-NEXT: ucvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT: ucvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: ucvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: splice z0.h, p0, z0.h, z1.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s
-; VBITS_GE_1024-NEXT: ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v32i32_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = uitofp <32 x i32> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
ret void
}
-define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i32_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #48
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #40
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT: ucvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: ptrue p2.h, vl8
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: splice z2.h, p2, z2.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z6
-; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z6.s
-; VBITS_GE_256-NEXT: ucvtf z5.h, p1/m, z5.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT: ucvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT: ucvtf z4.h, p1/m, z4.s
-; VBITS_GE_256-NEXT: splice z5.h, p2, z5.h, z1.h
-; VBITS_GE_256-NEXT: ucvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z7.s
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: splice z4.h, p2, z4.h, z3.h
-; VBITS_GE_256-NEXT: splice z1.h, p2, z1.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s
-; VBITS_GE_2048-NEXT: ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v64i32_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = uitofp <64 x i32> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
@@ -736,7 +435,7 @@ define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
+define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i32_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf v0.2s, v0.2s
@@ -746,7 +445,7 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
+define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i32_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf v0.4s, v0.4s
@@ -755,7 +454,7 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
ret <4 x float> %res
}
-define void @ucvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 {
+define void @ucvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v8i32_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -795,84 +494,28 @@ define void @ucvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @ucvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: ucvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v32i32_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = uitofp <32 x i32> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
-define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i32_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #56
-; VBITS_GE_256-NEXT: mov x14, #32
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: ucvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: ucvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT: ucvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT: ucvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT: ucvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v64i32_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = uitofp <64 x i32> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@@ -884,7 +527,7 @@ define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
+define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i32_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
@@ -896,7 +539,7 @@ define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
+define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i32_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
@@ -906,7 +549,7 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
ret <2 x double> %res
}
-define void @ucvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 {
+define void @ucvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i32_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -952,102 +595,32 @@ define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: uunpklo z2.d, z1.s
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i32_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i32_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%res = uitofp <16 x i32> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x11, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT: ucvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT: ucvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT: ucvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: uunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: mov x10, #20
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i32_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i32_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = uitofp <32 x i32> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -1059,7 +632,7 @@ define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
+define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i64_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -1071,7 +644,7 @@ define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
}
; v2f16 is not legal for NEON, so use SVE
-define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
+define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -1085,7 +658,7 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
ret <2 x half> %res
}
-define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) #0 {
+define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i64_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1134,126 +707,37 @@ define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 {
ret <8 x half> %res
}
-define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d
-; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl8
-; VBITS_GE_256-NEXT: mov v2.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
-; VBITS_GE_1024-NEXT: ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i64_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = uitofp <16 x i64> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b
ret void
}
-define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: mov x11, #28
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x13, #20
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: mov v1.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: movprfx z2, z6
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z6.d
-; VBITS_GE_256-NEXT: ucvtf z5.h, p0/m, z5.d
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT: mov v5.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: movprfx z2, z4
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z4.d
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT: movprfx z2, z7
-; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z7.d
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl8
-; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0]
-; VBITS_GE_256-NEXT: splice z5.h, p0, z5.h, z1.h
-; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z3.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
-; VBITS_GE_2048-NEXT: ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
- %op1 = load <32 x i64>, <32 x i64>* %a
+define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i64_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %op1 = load <32 x i64>, <32 x i64>* %a
%res = uitofp <32 x i64> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
ret void
@@ -1264,7 +748,7 @@ define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
+define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -1276,7 +760,7 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
+define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i64_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf v0.2d, v0.2d
@@ -1286,7 +770,7 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
ret <2 x float> %res
}
-define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) #0 {
+define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i64_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1335,110 +819,34 @@ define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 {
ret void
}
-define void @ucvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl4
-; VBITS_GE_256-NEXT: ucvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: ucvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z0.s, p0, z0.s, z1.s
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
-; VBITS_GE_1024-NEXT: ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i64_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = uitofp <16 x i64> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
-define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x11, #8
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #20
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT: ucvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: ptrue p2.s, vl4
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: splice z2.s, p2, z2.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z6
-; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z6.d
-; VBITS_GE_256-NEXT: ucvtf z5.s, p1/m, z5.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: ucvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT: ucvtf z4.s, p1/m, z4.d
-; VBITS_GE_256-NEXT: splice z5.s, p2, z5.s, z1.s
-; VBITS_GE_256-NEXT: ucvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z7.d
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: splice z4.s, p2, z4.s, z3.s
-; VBITS_GE_256-NEXT: splice z1.s, p2, z1.s, z0.s
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
-; VBITS_GE_2048-NEXT: ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i64_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = uitofp <32 x i64> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
@@ -1450,7 +858,7 @@ define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
+define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -1462,7 +870,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
+define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i64_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf v0.2d, v0.2d
@@ -1471,7 +879,7 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
ret <2 x double> %res
}
-define void @ucvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 {
+define void @ucvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v4i64_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1511,84 +919,28 @@ define void @ucvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @ucvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @ucvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i64_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = uitofp <16 x i64> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #16
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: ucvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT: ucvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT: ucvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT: ucvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i64_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = uitofp <32 x i64> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -1600,7 +952,7 @@ define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
+define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i16_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf v0.4h, v0.4h
@@ -1610,7 +962,7 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
+define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v8i16_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -1623,7 +975,7 @@ define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
ret void
}
-define void @scvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 {
+define void @scvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v16i16_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -1663,84 +1015,28 @@ define void @scvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @scvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i16_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v64i16_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: scvtf z0.h, p0/m, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v64i16_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = sitofp <64 x i16> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
ret void
}
-define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v128i16_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #96
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #80
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #112
-; VBITS_GE_256-NEXT: mov x14, #64
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT: scvtf z5.h, p0/m, z5.h
-; VBITS_GE_256-NEXT: scvtf z4.h, p0/m, z4.h
-; VBITS_GE_256-NEXT: scvtf z6.h, p0/m, z6.h
-; VBITS_GE_256-NEXT: scvtf z7.h, p0/m, z7.h
-; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v128i16_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: scvtf z0.h, p0/m, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v128i16_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%res = sitofp <128 x i16> %op1 to <128 x half>
store <128 x half> %res, <128 x half>* %b
@@ -1752,7 +1048,7 @@ define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
+define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i16_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.2s, v0.2s, #16
@@ -1764,7 +1060,7 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
+define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i16_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
@@ -1774,7 +1070,7 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
ret <4 x float> %res
}
-define void @scvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 {
+define void @scvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v8i16_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -1820,102 +1116,32 @@ define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @scvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i16_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v32i16_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v32i16_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%res = sitofp <32 x i16> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
-define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i16_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #32
-; VBITS_GE_256-NEXT: mov x10, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT: scvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT: scvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT: scvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: scvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT: scvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v64i16_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v64i16_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = sitofp <64 x i16> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@@ -1927,7 +1153,7 @@ define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
;
; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
-define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
+define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i16_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -1942,7 +1168,7 @@ define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
+define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i16_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.2s, v0.2s, #16
@@ -1954,7 +1180,7 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
ret <2 x double> %res
}
-define void @scvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 {
+define void @scvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i16_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -2002,119 +1228,34 @@ define void @scvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i16_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov z1.d, z0.d
-; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i16_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i16_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%res = sitofp <16 x i16> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i16_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: sunpklo z6.s, z6.h
-; VBITS_GE_256-NEXT: movprfx z0, z5
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z5.d
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z6.s
-; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: movprfx z0, z4
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z4.d
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i16_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i16_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%res = sitofp <32 x i16> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -2126,7 +1267,7 @@ define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
+define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i32_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -2138,7 +1279,7 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
+define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i32_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf v0.4s, v0.4s
@@ -2148,7 +1289,7 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
ret <4 x half> %res
}
-define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) #0 {
+define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v8i32_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -2197,110 +1338,34 @@ define void @scvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 {
ret void
}
-define void @scvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i32_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.h, vl8
-; VBITS_GE_256-NEXT: scvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT: scvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: scvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: splice z0.h, p0, z0.h, z1.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s
-; VBITS_GE_1024-NEXT: scvtf z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v32i32_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = sitofp <32 x i32> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
ret void
}
-define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i32_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #48
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #40
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT: scvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: ptrue p2.h, vl8
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: splice z2.h, p2, z2.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z6
-; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z6.s
-; VBITS_GE_256-NEXT: scvtf z5.h, p1/m, z5.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT: scvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT: scvtf z4.h, p1/m, z4.s
-; VBITS_GE_256-NEXT: splice z5.h, p2, z5.h, z1.h
-; VBITS_GE_256-NEXT: scvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z7.s
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: splice z4.h, p2, z4.h, z3.h
-; VBITS_GE_256-NEXT: splice z1.h, p2, z1.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s
-; VBITS_GE_2048-NEXT: scvtf z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v64i32_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = sitofp <64 x i32> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
@@ -2312,7 +1377,7 @@ define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
+define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i32_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf v0.2s, v0.2s
@@ -2322,7 +1387,7 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
+define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i32_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf v0.4s, v0.4s
@@ -2331,7 +1396,7 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
ret <4 x float> %res
}
-define void @scvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 {
+define void @scvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v8i32_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -2371,84 +1436,28 @@ define void @scvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @scvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i32_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: scvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v32i32_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = sitofp <32 x i32> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
-define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i32_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #56
-; VBITS_GE_256-NEXT: mov x14, #32
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT: scvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT: scvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT: scvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT: scvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT: scvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: scvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v64i32_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = sitofp <64 x i32> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@@ -2460,7 +1469,7 @@ define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
+define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i32_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
@@ -2472,7 +1481,7 @@ define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
+define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i32_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
@@ -2482,7 +1491,7 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
ret <2 x double> %res
}
-define void @scvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 {
+define void @scvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i32_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -2528,102 +1537,32 @@ define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i32_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: sunpklo z2.d, z0.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: sunpklo z2.d, z1.s
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i32_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i32_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%res = sitofp <16 x i32> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i32_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x11, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: sunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT: scvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT: scvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT: scvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: mov x10, #20
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i32_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i32_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = sitofp <32 x i32> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -2635,7 +1574,7 @@ define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
+define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i64_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -2647,7 +1586,7 @@ define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
}
; v2f16 is not legal for NEON, so use SVE
-define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
+define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -2661,7 +1600,7 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
ret <2 x half> %res
}
-define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) #0 {
+define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i64_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -2710,125 +1649,36 @@ define <8 x half> @scvtf_v8i64_v8f16(<8 x i64>* %a) #0 {
ret <8 x half> %res
}
-define void @scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i64_v16f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d
-; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl8
-; VBITS_GE_256-NEXT: mov v2.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
-; VBITS_GE_1024-NEXT: scvtf z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i64_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = sitofp <16 x i64> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b
ret void
}
-define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i64_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: mov x11, #28
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x13, #20
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: mov v1.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: movprfx z2, z6
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z6.d
-; VBITS_GE_256-NEXT: scvtf z5.h, p0/m, z5.d
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT: mov v5.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: movprfx z2, z4
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z4.d
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT: movprfx z2, z7
-; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z7.d
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl8
-; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0]
-; VBITS_GE_256-NEXT: splice z5.h, p0, z5.h, z1.h
-; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z3.h
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
-; VBITS_GE_2048-NEXT: scvtf z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i64_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = sitofp <32 x i64> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
@@ -2840,7 +1690,7 @@ define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
+define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -2852,7 +1702,7 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
+define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i64_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf v0.2d, v0.2d
@@ -2862,7 +1712,7 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
ret <2 x float> %res
}
-define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) #0 {
+define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i64_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -2911,110 +1761,34 @@ define void @scvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 {
ret void
}
-define void @scvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i64_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl4
-; VBITS_GE_256-NEXT: scvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: scvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z0.s, p0, z0.s, z1.s
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
-; VBITS_GE_1024-NEXT: scvtf z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i64_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = sitofp <16 x i64> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
-define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i64_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x11, #8
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #20
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT: scvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: ptrue p2.s, vl4
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: splice z2.s, p2, z2.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z6
-; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z6.d
-; VBITS_GE_256-NEXT: scvtf z5.s, p1/m, z5.d
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: scvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT: scvtf z4.s, p1/m, z4.d
-; VBITS_GE_256-NEXT: splice z5.s, p2, z5.s, z1.s
-; VBITS_GE_256-NEXT: scvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT: movprfx z1, z7
-; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z7.d
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: splice z4.s, p2, z4.s, z3.s
-; VBITS_GE_256-NEXT: splice z1.s, p2, z1.s, z0.s
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
-; VBITS_GE_2048-NEXT: scvtf z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i64_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = sitofp <32 x i64> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
@@ -3026,7 +1800,7 @@ define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
+define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -3038,7 +1812,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
+define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i64_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf v0.2d, v0.2d
@@ -3047,7 +1821,7 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
ret <2 x double> %res
}
-define void @scvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 {
+define void @scvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v4i64_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -3087,84 +1861,28 @@ define void @scvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @scvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i64_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @scvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i64_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = sitofp <16 x i64> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
-define void @scvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i64_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #16
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT: scvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT: scvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT: scvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT: scvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @scvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i64_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = sitofp <32 x i64> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index 86e5092b71af..a5b725a8fa1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -1,26 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 {
+define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.8b, v2.8b, #7
@@ -32,7 +18,7 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.16b, v2.16b, #7
@@ -43,1116 +29,96 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
ret <16 x i8> %sel
}
-define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, <32 x i1>* %c) #0 {
+define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: ldr w8, [x2]
; CHECK-NEXT: ptrue p0.b, vl32
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: asr w9, w8, #31
-; CHECK-NEXT: sbfx w10, w8, #30, #1
-; CHECK-NEXT: sbfx w11, w8, #29, #1
-; CHECK-NEXT: strb w9, [sp, #31]
-; CHECK-NEXT: sbfx w9, w8, #28, #1
-; CHECK-NEXT: strb w10, [sp, #30]
-; CHECK-NEXT: sbfx w10, w8, #27, #1
-; CHECK-NEXT: strb w11, [sp, #29]
-; CHECK-NEXT: sbfx w11, w8, #26, #1
-; CHECK-NEXT: strb w9, [sp, #28]
-; CHECK-NEXT: sbfx w9, w8, #25, #1
-; CHECK-NEXT: strb w10, [sp, #27]
-; CHECK-NEXT: sbfx w10, w8, #24, #1
-; CHECK-NEXT: strb w11, [sp, #26]
-; CHECK-NEXT: sbfx w11, w8, #23, #1
-; CHECK-NEXT: strb w9, [sp, #25]
-; CHECK-NEXT: sbfx w9, w8, #22, #1
-; CHECK-NEXT: strb w10, [sp, #24]
-; CHECK-NEXT: sbfx w10, w8, #21, #1
-; CHECK-NEXT: strb w11, [sp, #23]
-; CHECK-NEXT: sbfx w11, w8, #20, #1
-; CHECK-NEXT: strb w9, [sp, #22]
-; CHECK-NEXT: sbfx w9, w8, #19, #1
-; CHECK-NEXT: strb w10, [sp, #21]
-; CHECK-NEXT: sbfx w10, w8, #18, #1
-; CHECK-NEXT: strb w11, [sp, #20]
-; CHECK-NEXT: sbfx w11, w8, #17, #1
-; CHECK-NEXT: strb w9, [sp, #19]
-; CHECK-NEXT: sbfx w9, w8, #16, #1
-; CHECK-NEXT: strb w10, [sp, #18]
-; CHECK-NEXT: sbfx w10, w8, #15, #1
-; CHECK-NEXT: strb w11, [sp, #17]
-; CHECK-NEXT: sbfx w11, w8, #14, #1
-; CHECK-NEXT: strb w9, [sp, #16]
-; CHECK-NEXT: sbfx w9, w8, #13, #1
-; CHECK-NEXT: strb w10, [sp, #15]
-; CHECK-NEXT: sbfx w10, w8, #12, #1
-; CHECK-NEXT: strb w11, [sp, #14]
-; CHECK-NEXT: sbfx w11, w8, #11, #1
-; CHECK-NEXT: strb w9, [sp, #13]
-; CHECK-NEXT: sbfx w9, w8, #10, #1
-; CHECK-NEXT: strb w10, [sp, #12]
-; CHECK-NEXT: sbfx w10, w8, #9, #1
-; CHECK-NEXT: strb w11, [sp, #11]
-; CHECK-NEXT: sbfx w11, w8, #8, #1
-; CHECK-NEXT: strb w9, [sp, #10]
-; CHECK-NEXT: sbfx w9, w8, #7, #1
-; CHECK-NEXT: strb w10, [sp, #9]
-; CHECK-NEXT: sbfx w10, w8, #6, #1
-; CHECK-NEXT: strb w11, [sp, #8]
-; CHECK-NEXT: sbfx w11, w8, #5, #1
-; CHECK-NEXT: strb w9, [sp, #7]
-; CHECK-NEXT: sbfx w9, w8, #4, #1
-; CHECK-NEXT: strb w10, [sp, #6]
-; CHECK-NEXT: sbfx w10, w8, #3, #1
-; CHECK-NEXT: strb w11, [sp, #5]
-; CHECK-NEXT: sbfx w11, w8, #2, #1
-; CHECK-NEXT: strb w9, [sp, #4]
-; CHECK-NEXT: sbfx w9, w8, #1, #1
-; CHECK-NEXT: sbfx w8, w8, #0, #1
-; CHECK-NEXT: strb w10, [sp, #3]
-; CHECK-NEXT: strb w11, [sp, #2]
-; CHECK-NEXT: strb w9, [sp, #1]
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp]
-; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
-; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1]
-; CHECK-NEXT: and z0.b, z0.b, #0x1
-; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0
-; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: .cfi_def_cfa wsp, 16
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w30
-; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
- %mask = load <32 x i1>, <32 x i1>* %c
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
+ %mask = icmp eq <32 x i8> %op1, %op2
%sel = select <32 x i1> %mask, <32 x i8> %op1, <32 x i8> %op2
store <32 x i8> %sel, <32 x i8>* %a
ret void
}
-define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, <64 x i1>* %c) #0 {
+define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
+; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
+; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z2.b
+; VBITS_GE_256-NEXT: sel z1.b, p2, z1.b, z3.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: select_v64i8:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT: mov x29, sp
-; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT: .cfi_offset w30, -8
-; VBITS_GE_512-NEXT: .cfi_offset w29, -16
-; VBITS_GE_512-NEXT: sub x9, sp, #112
-; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT: ldr x8, [x2]
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
-; VBITS_GE_512-NEXT: ptrue p1.b
-; VBITS_GE_512-NEXT: asr x9, x8, #63
-; VBITS_GE_512-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_512-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #63]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #62]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #61]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #60]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #59]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #58]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #57]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #56]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #55]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #54]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #53]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #52]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #51]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #50]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #49]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #48]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #47]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #46]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #45]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #44]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #43]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #42]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #41]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #40]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #39]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #38]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #37]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #36]
-; VBITS_GE_512-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #35]
-; VBITS_GE_512-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #34]
-; VBITS_GE_512-NEXT: asr w11, w8, #31
-; VBITS_GE_512-NEXT: strb w9, [sp, #33]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #32]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #31]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #30]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #29]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #28]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #27]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #26]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #25]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #24]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #23]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #22]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #21]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #20]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #19]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #18]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #17]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #16]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #15]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #14]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #13]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #12]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #11]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #10]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #9]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #8]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #7]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #6]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_512-NEXT: strb w10, [sp, #5]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_512-NEXT: strb w11, [sp, #4]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT: strb w9, [sp, #3]
-; VBITS_GE_512-NEXT: strb w10, [sp, #2]
-; VBITS_GE_512-NEXT: strb w11, [sp, #1]
-; VBITS_GE_512-NEXT: strb w8, [sp]
-; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_512-NEXT: and z0.b, z0.b, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z0.b, #0
-; VBITS_GE_512-NEXT: sel z0.b, p1, z1.b, z2.b
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_512-NEXT: mov sp, x29
-; VBITS_GE_512-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT: .cfi_restore w30
-; VBITS_GE_512-NEXT: .cfi_restore w29
; VBITS_GE_512-NEXT: ret
- %mask = load <64 x i1>, <64 x i1>* %c
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
+ %mask = icmp eq <64 x i8> %op1, %op2
%sel = select <64 x i1> %mask, <64 x i8> %op1, <64 x i8> %op2
store <64 x i8> %sel, <64 x i8>* %a
ret void
}
-define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, <128 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT: mov x29, sp
-; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT: .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT: .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT: sub x9, sp, #240
-; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT: ldr x8, [x2, #8]
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: ptrue p1.b
-; VBITS_GE_1024-NEXT: asr x9, x8, #63
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #127]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #126]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #125]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #124]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #123]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #122]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #121]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #120]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #119]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #118]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #117]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #116]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #115]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #114]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #113]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #112]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #111]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #110]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #109]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #108]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #107]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #106]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #105]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #104]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #103]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #102]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #101]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #100]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #99]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #98]
-; VBITS_GE_1024-NEXT: asr w11, w8, #31
-; VBITS_GE_1024-NEXT: strb w9, [sp, #97]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #96]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #95]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #94]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #93]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #92]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #91]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #90]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #89]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #88]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #87]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #86]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #85]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #84]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #83]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #82]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #81]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #80]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #79]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #78]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #77]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #76]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #75]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #74]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #73]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #72]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #71]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #70]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #69]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #68]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #67]
-; VBITS_GE_1024-NEXT: strb w10, [sp, #66]
-; VBITS_GE_1024-NEXT: strb w11, [sp, #65]
-; VBITS_GE_1024-NEXT: strb w8, [sp, #64]
-; VBITS_GE_1024-NEXT: ldr x8, [x2]
-; VBITS_GE_1024-NEXT: asr x9, x8, #63
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #63]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #62]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #61]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #60]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #59]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #58]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #57]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #56]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #55]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #54]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #53]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #52]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #51]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #50]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #49]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #48]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #47]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #46]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #45]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #44]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #43]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #42]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #41]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #40]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #39]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #38]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #37]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #36]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #35]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #34]
-; VBITS_GE_1024-NEXT: asr w11, w8, #31
-; VBITS_GE_1024-NEXT: strb w9, [sp, #33]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #32]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #31]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #30]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #29]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #28]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #27]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #26]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #25]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #24]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #23]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #22]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #21]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #20]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #19]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #18]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #17]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #16]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #15]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #14]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #13]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #12]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #11]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #10]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #9]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #8]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #7]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #6]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT: strb w10, [sp, #5]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT: strb w11, [sp, #4]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT: strb w9, [sp, #3]
-; VBITS_GE_1024-NEXT: strb w10, [sp, #2]
-; VBITS_GE_1024-NEXT: strb w11, [sp, #1]
-; VBITS_GE_1024-NEXT: strb w8, [sp]
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: and z0.b, z0.b, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.b, p1/z, z0.b, #0
-; VBITS_GE_1024-NEXT: sel z0.b, p1, z1.b, z2.b
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: mov sp, x29
-; VBITS_GE_1024-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT: .cfi_restore w30
-; VBITS_GE_1024-NEXT: .cfi_restore w29
-; VBITS_GE_1024-NEXT: ret
- %mask = load <128 x i1>, <128 x i1>* %c
+define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
+ %mask = icmp eq <128 x i8> %op1, %op2
%sel = select <128 x i1> %mask, <128 x i8> %op1, <128 x i8> %op2
store <128 x i8> %sel, <128 x i8>* %a
ret void
}
-define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, <256 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT: mov x29, sp
-; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT: .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT: .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT: sub x9, sp, #496
-; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT: ldr x8, [x2, #24]
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: ptrue p1.b
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #255]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #254]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #253]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #252]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #251]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #250]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #249]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #248]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #247]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #246]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #245]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #244]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #243]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #242]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #241]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #240]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #239]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #238]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #237]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #236]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #235]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #234]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #233]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #232]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #231]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #230]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #229]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #228]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #227]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #226]
-; VBITS_GE_2048-NEXT: asr w11, w8, #31
-; VBITS_GE_2048-NEXT: strb w9, [sp, #225]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #224]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #223]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #222]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #221]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #220]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #219]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #218]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #217]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #216]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #215]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #214]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #213]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #212]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #211]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #210]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #209]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #208]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #207]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #206]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #205]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #204]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #203]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #202]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #201]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #200]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #199]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #198]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #197]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #196]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #195]
-; VBITS_GE_2048-NEXT: strb w10, [sp, #194]
-; VBITS_GE_2048-NEXT: strb w11, [sp, #193]
-; VBITS_GE_2048-NEXT: strb w8, [sp, #192]
-; VBITS_GE_2048-NEXT: ldr x8, [x2, #16]
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #191]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #190]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #189]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #188]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #187]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #186]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #185]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #184]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #183]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #182]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #181]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #180]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #179]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #178]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #177]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #176]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #175]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #174]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #173]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #172]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #171]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #170]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #169]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #168]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #167]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #166]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #165]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #164]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #163]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #162]
-; VBITS_GE_2048-NEXT: asr w11, w8, #31
-; VBITS_GE_2048-NEXT: strb w9, [sp, #161]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #160]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #159]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #158]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #157]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #156]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #155]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #154]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #153]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #152]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #151]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #150]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #149]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #148]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #147]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #146]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #145]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #144]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #143]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #142]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #141]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #140]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #139]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #138]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #137]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #136]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #135]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #134]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #133]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #132]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #131]
-; VBITS_GE_2048-NEXT: strb w10, [sp, #130]
-; VBITS_GE_2048-NEXT: strb w11, [sp, #129]
-; VBITS_GE_2048-NEXT: strb w8, [sp, #128]
-; VBITS_GE_2048-NEXT: ldr x8, [x2, #8]
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #127]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #126]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #125]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #124]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #123]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #122]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #121]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #120]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #119]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #118]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #117]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #116]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #115]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #114]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #113]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #112]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #111]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #110]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #109]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #108]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #107]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #106]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #105]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #104]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #103]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #102]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #101]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #100]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #99]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #98]
-; VBITS_GE_2048-NEXT: asr w11, w8, #31
-; VBITS_GE_2048-NEXT: strb w9, [sp, #97]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #96]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #95]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #94]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #93]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #92]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #91]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #90]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #89]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #88]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #87]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #86]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #85]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #84]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #83]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #82]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #81]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #80]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #79]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #78]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #77]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #76]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #75]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #74]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #73]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #72]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #71]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #70]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #69]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #68]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #67]
-; VBITS_GE_2048-NEXT: strb w10, [sp, #66]
-; VBITS_GE_2048-NEXT: strb w11, [sp, #65]
-; VBITS_GE_2048-NEXT: strb w8, [sp, #64]
-; VBITS_GE_2048-NEXT: ldr x8, [x2]
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #63]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #62]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #61]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #60]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #59]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #58]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #57]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #56]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #55]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #54]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #53]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #52]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #51]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #50]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #49]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #48]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #47]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #46]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #45]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #44]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #43]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #42]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #41]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #40]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #39]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #38]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #37]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #36]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #35]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #34]
-; VBITS_GE_2048-NEXT: asr w11, w8, #31
-; VBITS_GE_2048-NEXT: strb w9, [sp, #33]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #32]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #31]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #30]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #29]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #28]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #27]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #26]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #25]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #24]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #23]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #22]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #21]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #20]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #19]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #18]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #17]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #16]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #15]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #14]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #13]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #12]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #11]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #10]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #9]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #8]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #7]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #6]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strb w10, [sp, #5]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strb w11, [sp, #4]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strb w9, [sp, #3]
-; VBITS_GE_2048-NEXT: strb w10, [sp, #2]
-; VBITS_GE_2048-NEXT: strb w11, [sp, #1]
-; VBITS_GE_2048-NEXT: strb w8, [sp]
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_2048-NEXT: ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: and z0.b, z0.b, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.b, p1/z, z0.b, #0
-; VBITS_GE_2048-NEXT: sel z0.b, p1, z1.b, z2.b
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: mov sp, x29
-; VBITS_GE_2048-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT: .cfi_restore w30
-; VBITS_GE_2048-NEXT: .cfi_restore w29
-; VBITS_GE_2048-NEXT: ret
- %mask = load <256 x i1>, <256 x i1>* %c
+define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
+ %mask = icmp eq <256 x i8> %op1, %op2
%sel = select <256 x i1> %mask, <256 x i8> %op1, <256 x i8> %op2
store <256 x i8> %sel, <256 x i8>* %a
ret void
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #0 {
+define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.4h, v2.4h, #15
@@ -1164,7 +130,7 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
+define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
@@ -1176,633 +142,96 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #
ret <8 x i16> %sel
}
-define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, <16 x i1>* %c) #0 {
+define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: ldrh w8, [x2]
; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: sbfx w9, w8, #15, #1
-; CHECK-NEXT: sbfx w10, w8, #14, #1
-; CHECK-NEXT: sbfx w11, w8, #13, #1
-; CHECK-NEXT: strh w9, [sp, #30]
-; CHECK-NEXT: sbfx w9, w8, #12, #1
-; CHECK-NEXT: strh w10, [sp, #28]
-; CHECK-NEXT: sbfx w10, w8, #11, #1
-; CHECK-NEXT: strh w11, [sp, #26]
-; CHECK-NEXT: sbfx w11, w8, #10, #1
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: sbfx w9, w8, #9, #1
-; CHECK-NEXT: strh w10, [sp, #22]
-; CHECK-NEXT: sbfx w10, w8, #8, #1
-; CHECK-NEXT: strh w11, [sp, #20]
-; CHECK-NEXT: sbfx w11, w8, #7, #1
-; CHECK-NEXT: strh w9, [sp, #18]
-; CHECK-NEXT: sbfx w9, w8, #6, #1
-; CHECK-NEXT: strh w10, [sp, #16]
-; CHECK-NEXT: sbfx w10, w8, #5, #1
-; CHECK-NEXT: strh w11, [sp, #14]
-; CHECK-NEXT: sbfx w11, w8, #4, #1
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: sbfx w9, w8, #3, #1
-; CHECK-NEXT: strh w10, [sp, #10]
-; CHECK-NEXT: sbfx w10, w8, #2, #1
-; CHECK-NEXT: strh w11, [sp, #8]
-; CHECK-NEXT: sbfx w11, w8, #1, #1
-; CHECK-NEXT: sbfx w8, w8, #0, #1
-; CHECK-NEXT: strh w9, [sp, #6]
-; CHECK-NEXT: strh w10, [sp, #4]
-; CHECK-NEXT: strh w11, [sp, #2]
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
-; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1]
-; CHECK-NEXT: and z0.h, z0.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0
-; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: .cfi_def_cfa wsp, 16
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w30
-; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
- %mask = load <16 x i1>, <16 x i1>* %c
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
+ %mask = icmp eq <16 x i16> %op1, %op2
%sel = select <16 x i1> %mask, <16 x i16> %op1, <16 x i16> %op2
store <16 x i16> %sel, <16 x i16>* %a
ret void
}
-define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i1>* %c) #0 {
+define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
+; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: select_v32i16:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT: mov x29, sp
-; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT: .cfi_offset w30, -8
-; VBITS_GE_512-NEXT: .cfi_offset w29, -16
-; VBITS_GE_512-NEXT: sub x9, sp, #112
-; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT: ldr w8, [x2]
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: ptrue p1.h
-; VBITS_GE_512-NEXT: asr w9, w8, #31
-; VBITS_GE_512-NEXT: sbfx w10, w8, #30, #1
-; VBITS_GE_512-NEXT: sbfx w11, w8, #29, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #62]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #28, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #60]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #27, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #58]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #26, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #56]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #25, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #54]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #24, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #52]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #23, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #50]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #22, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #48]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #21, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #46]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #20, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #44]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #19, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #42]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #18, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #40]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #17, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #38]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #16, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #36]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #15, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #34]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #14, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #32]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #13, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #30]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #12, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #28]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #11, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #26]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #10, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #24]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #9, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #22]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #8, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #20]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #18]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #6, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #16]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #5, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #14]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #4, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #12]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #3, #1
-; VBITS_GE_512-NEXT: strh w11, [sp, #10]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #2, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #8]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #1, #1
-; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT: strh w10, [sp, #6]
-; VBITS_GE_512-NEXT: strh w11, [sp, #4]
-; VBITS_GE_512-NEXT: strh w9, [sp, #2]
-; VBITS_GE_512-NEXT: strh w8, [sp]
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [sp]
-; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0
-; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT: mov sp, x29
-; VBITS_GE_512-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT: .cfi_restore w30
-; VBITS_GE_512-NEXT: .cfi_restore w29
; VBITS_GE_512-NEXT: ret
- %mask = load <32 x i1>, <32 x i1>* %c
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
+ %mask = icmp eq <32 x i16> %op1, %op2
%sel = select <32 x i1> %mask, <32 x i16> %op1, <32 x i16> %op2
store <32 x i16> %sel, <32 x i16>* %a
ret void
}
-define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, <64 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT: mov x29, sp
-; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT: .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT: .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT: sub x9, sp, #240
-; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT: ldr x8, [x2]
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ptrue p1.h
-; VBITS_GE_1024-NEXT: asr x9, x8, #63
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #126]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #124]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #122]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #120]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #118]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #116]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #114]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #112]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #110]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #108]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #106]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #104]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #102]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #100]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #98]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #96]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #94]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #92]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #90]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #88]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #86]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #84]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #82]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #80]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #78]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #76]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #74]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #72]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #70]
-; VBITS_GE_1024-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #68]
-; VBITS_GE_1024-NEXT: asr w11, w8, #31
-; VBITS_GE_1024-NEXT: strh w9, [sp, #66]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #64]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #62]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #60]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #58]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #56]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #54]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #52]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #50]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #48]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #46]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #44]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #42]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #40]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #38]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #36]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #34]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #32]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #30]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #28]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #26]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #24]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #22]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #20]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #18]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #16]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #14]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #12]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT: strh w10, [sp, #10]
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT: strh w11, [sp, #8]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #6]
-; VBITS_GE_1024-NEXT: strh w10, [sp, #4]
-; VBITS_GE_1024-NEXT: strh w11, [sp, #2]
-; VBITS_GE_1024-NEXT: strh w8, [sp]
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [sp]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: and z0.h, z0.h, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.h, p1/z, z0.h, #0
-; VBITS_GE_1024-NEXT: sel z0.h, p1, z1.h, z2.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: mov sp, x29
-; VBITS_GE_1024-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT: .cfi_restore w30
-; VBITS_GE_1024-NEXT: .cfi_restore w29
-; VBITS_GE_1024-NEXT: ret
- %mask = load <64 x i1>, <64 x i1>* %c
+define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
+ %mask = icmp eq <64 x i16> %op1, %op2
%sel = select <64 x i1> %mask, <64 x i16> %op1, <64 x i16> %op2
store <64 x i16> %sel, <64 x i16>* %a
ret void
}
-define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, <128 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT: mov x29, sp
-; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT: .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT: .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT: sub x9, sp, #496
-; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT: ldr x8, [x2, #8]
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ptrue p1.h
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #254]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #252]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #250]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #248]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #246]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #244]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #242]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #240]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #238]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #236]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #234]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #232]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #230]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #228]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #226]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #224]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #222]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #220]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #218]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #216]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #214]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #212]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #210]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #208]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #206]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #204]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #202]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #200]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #198]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #196]
-; VBITS_GE_2048-NEXT: asr w11, w8, #31
-; VBITS_GE_2048-NEXT: strh w9, [sp, #194]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #192]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #190]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #188]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #186]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #184]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #182]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #180]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #178]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #176]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #174]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #172]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #170]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #168]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #166]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #164]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #162]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #160]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #158]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #156]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #154]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #152]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #150]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #148]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #146]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #144]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #142]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #140]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #138]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #136]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #134]
-; VBITS_GE_2048-NEXT: strh w10, [sp, #132]
-; VBITS_GE_2048-NEXT: strh w11, [sp, #130]
-; VBITS_GE_2048-NEXT: strh w8, [sp, #128]
-; VBITS_GE_2048-NEXT: ldr x8, [x2]
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #126]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #124]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #122]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #120]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #118]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #116]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #114]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #112]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #110]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #108]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #106]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #104]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #102]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #100]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #98]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #96]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #94]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #92]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #90]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #88]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #86]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #84]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #82]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #80]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #78]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #76]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #74]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #72]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #70]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #68]
-; VBITS_GE_2048-NEXT: asr w11, w8, #31
-; VBITS_GE_2048-NEXT: strh w9, [sp, #66]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #64]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #62]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #60]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #58]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #56]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #54]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #52]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #50]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #48]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #46]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #44]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #42]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #40]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #38]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #36]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #34]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #32]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #30]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #28]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #26]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #24]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #22]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #20]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #18]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #16]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #14]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #12]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #10]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strh w11, [sp, #8]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #6]
-; VBITS_GE_2048-NEXT: strh w10, [sp, #4]
-; VBITS_GE_2048-NEXT: strh w11, [sp, #2]
-; VBITS_GE_2048-NEXT: strh w8, [sp]
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [sp]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: and z0.h, z0.h, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.h, p1/z, z0.h, #0
-; VBITS_GE_2048-NEXT: sel z0.h, p1, z1.h, z2.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: mov sp, x29
-; VBITS_GE_2048-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT: .cfi_restore w30
-; VBITS_GE_2048-NEXT: .cfi_restore w29
-; VBITS_GE_2048-NEXT: ret
- %mask = load <128 x i1>, <128 x i1>* %c
+define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
+ %mask = icmp eq <128 x i16> %op1, %op2
%sel = select <128 x i1> %mask, <128 x i16> %op1, <128 x i16> %op2
store <128 x i16> %sel, <128 x i16>* %a
ret void
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #0 {
+define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.2s, v2.2s, #31
@@ -1814,7 +243,7 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #0 {
+define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
@@ -1826,332 +255,96 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #
ret <4 x i32> %sel
}
-define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, <8 x i1>* %c) #0 {
+define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: ldrb w8, [x2]
; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: sbfx w9, w8, #7, #1
-; CHECK-NEXT: sbfx w10, w8, #6, #1
-; CHECK-NEXT: sbfx w11, w8, #5, #1
-; CHECK-NEXT: sbfx w12, w8, #4, #1
-; CHECK-NEXT: stp w10, w9, [sp, #24]
-; CHECK-NEXT: sbfx w9, w8, #3, #1
-; CHECK-NEXT: sbfx w10, w8, #2, #1
-; CHECK-NEXT: stp w12, w11, [sp, #16]
-; CHECK-NEXT: sbfx w11, w8, #1, #1
-; CHECK-NEXT: sbfx w8, w8, #0, #1
-; CHECK-NEXT: stp w10, w9, [sp, #8]
-; CHECK-NEXT: stp w8, w11, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
-; CHECK-NEXT: and z0.s, z0.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0
-; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: .cfi_def_cfa wsp, 16
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w30
-; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
- %mask = load <8 x i1>, <8 x i1>* %c
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
+ %mask = icmp eq <8 x i32> %op1, %op2
%sel = select <8 x i1> %mask, <8 x i32> %op1, <8 x i32> %op2
store <8 x i32> %sel, <8 x i32>* %a
ret void
}
-define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i1>* %c) #0 {
+define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
+; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: select_v16i32:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT: mov x29, sp
-; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT: .cfi_offset w30, -8
-; VBITS_GE_512-NEXT: .cfi_offset w29, -16
-; VBITS_GE_512-NEXT: sub x9, sp, #112
-; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT: ldrh w8, [x2]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ptrue p1.s
-; VBITS_GE_512-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_512-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_512-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_512-NEXT: sbfx w12, w8, #12, #1
-; VBITS_GE_512-NEXT: stp w10, w9, [sp, #56]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #11, #1
-; VBITS_GE_512-NEXT: sbfx w10, w8, #10, #1
-; VBITS_GE_512-NEXT: stp w12, w11, [sp, #48]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #9, #1
-; VBITS_GE_512-NEXT: sbfx w12, w8, #8, #1
-; VBITS_GE_512-NEXT: stp w10, w9, [sp, #40]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_512-NEXT: sbfx w10, w8, #6, #1
-; VBITS_GE_512-NEXT: stp w12, w11, [sp, #32]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #5, #1
-; VBITS_GE_512-NEXT: sbfx w12, w8, #4, #1
-; VBITS_GE_512-NEXT: stp w10, w9, [sp, #24]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_512-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_512-NEXT: stp w12, w11, [sp, #16]
-; VBITS_GE_512-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT: stp w10, w9, [sp, #8]
-; VBITS_GE_512-NEXT: stp w8, w11, [sp]
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [sp]
-; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: and z0.s, z0.s, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0
-; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: mov sp, x29
-; VBITS_GE_512-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT: .cfi_restore w30
-; VBITS_GE_512-NEXT: .cfi_restore w29
; VBITS_GE_512-NEXT: ret
- %mask = load <16 x i1>, <16 x i1>* %c
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
+ %mask = icmp eq <16 x i32> %op1, %op2
%sel = select <16 x i1> %mask, <16 x i32> %op1, <16 x i32> %op2
store <16 x i32> %sel, <16 x i32>* %a
ret void
}
-define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT: mov x29, sp
-; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT: .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT: .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT: sub x9, sp, #240
-; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT: ldr w8, [x2]
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ptrue p1.s
-; VBITS_GE_1024-NEXT: asr w9, w8, #31
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #30, #1
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #29, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #28, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #120]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #112]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #24, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #104]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #23, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #22, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #96]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #21, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #20, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #88]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #19, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #18, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #80]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #17, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #16, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #72]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #64]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #12, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #56]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #11, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #10, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #48]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #9, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #8, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #40]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #6, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #32]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #5, #1
-; VBITS_GE_1024-NEXT: sbfx w12, w8, #4, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #24]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT: stp w12, w11, [sp, #16]
-; VBITS_GE_1024-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT: stp w10, w9, [sp, #8]
-; VBITS_GE_1024-NEXT: stp w8, w11, [sp]
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [sp]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: and z0.s, z0.s, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.s, p1/z, z0.s, #0
-; VBITS_GE_1024-NEXT: sel z0.s, p1, z1.s, z2.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: mov sp, x29
-; VBITS_GE_1024-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT: .cfi_restore w30
-; VBITS_GE_1024-NEXT: .cfi_restore w29
-; VBITS_GE_1024-NEXT: ret
- %mask = load <32 x i1>, <32 x i1>* %c
+define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
+ %mask = icmp eq <32 x i32> %op1, %op2
%sel = select <32 x i1> %mask, <32 x i32> %op1, <32 x i32> %op2
store <32 x i32> %sel, <32 x i32>* %a
ret void
}
-define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT: mov x29, sp
-; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT: .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT: .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT: sub x9, sp, #496
-; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT: ldr x8, [x2]
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ptrue p1.s
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #60, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #248]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #59, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #58, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #240]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #57, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #56, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #232]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #55, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #54, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #224]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #53, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #52, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #216]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #208]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #48, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #200]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #47, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #46, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #192]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #45, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #44, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #184]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #43, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #42, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #176]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #41, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #40, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #168]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #160]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #36, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #152]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #35, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #34, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #144]
-; VBITS_GE_2048-NEXT: sbfx x11, x8, #33, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x8, #32, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #136]
-; VBITS_GE_2048-NEXT: asr w9, w8, #31
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #30, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #128]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #29, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #28, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #120]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #112]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #24, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #104]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #23, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #22, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #96]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #21, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #20, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #88]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #19, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #18, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #80]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #17, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #16, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #72]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #64]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #12, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #56]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #11, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #10, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #48]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #9, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #8, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #40]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #6, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #32]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #5, #1
-; VBITS_GE_2048-NEXT: sbfx w12, w8, #4, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #24]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: stp w12, w11, [sp, #16]
-; VBITS_GE_2048-NEXT: sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: stp w10, w9, [sp, #8]
-; VBITS_GE_2048-NEXT: stp w8, w11, [sp]
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [sp]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: and z0.s, z0.s, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.s, p1/z, z0.s, #0
-; VBITS_GE_2048-NEXT: sel z0.s, p1, z1.s, z2.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: mov sp, x29
-; VBITS_GE_2048-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT: .cfi_restore w30
-; VBITS_GE_2048-NEXT: .cfi_restore w29
-; VBITS_GE_2048-NEXT: ret
- %mask = load <64 x i1>, <64 x i1>* %c
+define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
+ %mask = icmp eq <64 x i32> %op1, %op2
%sel = select <64 x i1> %mask, <64 x i32> %op1, <64 x i32> %op2
store <64 x i32> %sel, <64 x i32>* %a
ret void
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 {
+define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -2164,7 +357,7 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #0 {
+define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.2d, v2.2s, #0
@@ -2176,322 +369,89 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #
ret <2 x i64> %sel
}
-define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, <4 x i1>* %c) #0 {
+define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: ldrb w8, [x2]
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: lsr w9, w8, #3
-; CHECK-NEXT: lsr w10, w8, #2
-; CHECK-NEXT: sbfx x11, x8, #0, #1
-; CHECK-NEXT: lsr w8, w8, #1
-; CHECK-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: stp x10, x9, [sp, #16]
-; CHECK-NEXT: stp x11, x8, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
-; CHECK-NEXT: and z0.d, z0.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: .cfi_def_cfa wsp, 16
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w30
-; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
- %mask = load <4 x i1>, <4 x i1>* %c
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
+ %mask = icmp eq <4 x i64> %op1, %op2
%sel = select <4 x i1> %mask, <4 x i64> %op1, <4 x i64> %op2
store <4 x i64> %sel, <4 x i64>* %a
ret void
}
-define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, <8 x i1>* %c) #0 {
+define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: select_v8i64:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT: mov x29, sp
-; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT: .cfi_offset w30, -8
-; VBITS_GE_512-NEXT: .cfi_offset w29, -16
-; VBITS_GE_512-NEXT: sub x9, sp, #112
-; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT: ldrb w8, [x2]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ptrue p1.d
-; VBITS_GE_512-NEXT: lsr w9, w8, #7
-; VBITS_GE_512-NEXT: lsr w10, w8, #6
-; VBITS_GE_512-NEXT: lsr w11, w8, #5
-; VBITS_GE_512-NEXT: lsr w12, w8, #4
-; VBITS_GE_512-NEXT: sbfx x9, x9, #0, #1
-; VBITS_GE_512-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_512-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_512-NEXT: sbfx x12, x12, #0, #1
-; VBITS_GE_512-NEXT: lsr w13, w8, #3
-; VBITS_GE_512-NEXT: stp x10, x9, [sp, #48]
-; VBITS_GE_512-NEXT: lsr w9, w8, #2
-; VBITS_GE_512-NEXT: stp x12, x11, [sp, #32]
-; VBITS_GE_512-NEXT: sbfx x11, x8, #0, #1
-; VBITS_GE_512-NEXT: lsr w8, w8, #1
-; VBITS_GE_512-NEXT: sbfx x10, x13, #0, #1
-; VBITS_GE_512-NEXT: sbfx x9, x9, #0, #1
-; VBITS_GE_512-NEXT: sbfx x8, x8, #0, #1
-; VBITS_GE_512-NEXT: stp x9, x10, [sp, #16]
-; VBITS_GE_512-NEXT: stp x11, x8, [sp]
-; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [sp]
-; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: and z0.d, z0.d, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT: mov sp, x29
-; VBITS_GE_512-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT: .cfi_restore w30
-; VBITS_GE_512-NEXT: .cfi_restore w29
; VBITS_GE_512-NEXT: ret
- %mask = load <8 x i1>, <8 x i1>* %c
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
+ %mask = icmp eq <8 x i64> %op1, %op2
%sel = select <8 x i1> %mask, <8 x i64> %op1, <8 x i64> %op2
store <8 x i64> %sel, <8 x i64>* %a
ret void
}
-define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, <16 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT: mov x29, sp
-; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT: .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT: .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT: sub x9, sp, #240
-; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT: ldrh w8, [x2]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d
-; VBITS_GE_1024-NEXT: lsr w9, w8, #15
-; VBITS_GE_1024-NEXT: lsr w10, w8, #14
-; VBITS_GE_1024-NEXT: lsr w11, w8, #13
-; VBITS_GE_1024-NEXT: lsr w12, w8, #12
-; VBITS_GE_1024-NEXT: sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT: lsr w13, w8, #11
-; VBITS_GE_1024-NEXT: lsr w14, w8, #10
-; VBITS_GE_1024-NEXT: stp x10, x9, [sp, #112]
-; VBITS_GE_1024-NEXT: lsr w9, w8, #9
-; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #96]
-; VBITS_GE_1024-NEXT: lsr w12, w8, #8
-; VBITS_GE_1024-NEXT: sbfx x10, x13, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x11, x14, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT: lsr w13, w8, #3
-; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #80]
-; VBITS_GE_1024-NEXT: lsr w10, w8, #6
-; VBITS_GE_1024-NEXT: stp x12, x9, [sp, #64]
-; VBITS_GE_1024-NEXT: lsr w9, w8, #7
-; VBITS_GE_1024-NEXT: lsr w11, w8, #5
-; VBITS_GE_1024-NEXT: lsr w12, w8, #4
-; VBITS_GE_1024-NEXT: sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT: stp x10, x9, [sp, #48]
-; VBITS_GE_1024-NEXT: lsr w10, w8, #2
-; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #32]
-; VBITS_GE_1024-NEXT: sbfx x11, x8, #0, #1
-; VBITS_GE_1024-NEXT: lsr w8, w8, #1
-; VBITS_GE_1024-NEXT: sbfx x9, x13, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT: sbfx x8, x8, #0, #1
-; VBITS_GE_1024-NEXT: stp x10, x9, [sp, #16]
-; VBITS_GE_1024-NEXT: stp x11, x8, [sp]
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [sp]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: and z0.d, z0.d, #0x1
-; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_1024-NEXT: sel z0.d, p1, z1.d, z2.d
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: mov sp, x29
-; VBITS_GE_1024-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT: .cfi_restore w30
-; VBITS_GE_1024-NEXT: .cfi_restore w29
-; VBITS_GE_1024-NEXT: ret
- %mask = load <16 x i1>, <16 x i1>* %c
+define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
+ %mask = icmp eq <16 x i64> %op1, %op2
%sel = select <16 x i1> %mask, <16 x i64> %op1, <16 x i64> %op2
store <16 x i64> %sel, <16 x i64>* %a
ret void
}
-define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT: mov x29, sp
-; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT: .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT: .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT: sub x9, sp, #496
-; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT: ldr w8, [x2]
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d
-; VBITS_GE_2048-NEXT: ubfx x9, x8, #31, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #30, #2
-; VBITS_GE_2048-NEXT: // kill: def $w9 killed $w9 killed $x9 def $x9
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #29, #3
-; VBITS_GE_2048-NEXT: ubfx x12, x8, #28, #4
-; VBITS_GE_2048-NEXT: sbfx x9, x9, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #27, #5
-; VBITS_GE_2048-NEXT: ubfx x14, x8, #26, #6
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT: stp x10, x9, [sp, #240]
-; VBITS_GE_2048-NEXT: sbfx x9, x11, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #25, #7
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #23, #9
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x11, x9, [sp, #224]
-; VBITS_GE_2048-NEXT: sbfx x9, x14, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #24, #8
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: stp x9, x12, [sp, #208]
-; VBITS_GE_2048-NEXT: sbfx x9, x10, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #22, #10
-; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #21, #11
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x11, x9, [sp, #192]
-; VBITS_GE_2048-NEXT: sbfx x9, x10, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #20, #12
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #19, #13
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: stp x9, x12, [sp, #176]
-; VBITS_GE_2048-NEXT: sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x12, x8, #18, #14
-; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #17, #15
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x10, x9, [sp, #160]
-; VBITS_GE_2048-NEXT: sbfx x9, x12, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #16, #16
-; VBITS_GE_2048-NEXT: ubfx x12, x8, #15, #17
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT: stp x9, x11, [sp, #144]
-; VBITS_GE_2048-NEXT: sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #14, #18
-; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #13, #19
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x10, x9, [sp, #128]
-; VBITS_GE_2048-NEXT: sbfx x9, x11, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #12, #20
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #11, #21
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: stp x9, x12, [sp, #112]
-; VBITS_GE_2048-NEXT: sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x12, x8, #10, #22
-; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #9, #23
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x10, x9, [sp, #96]
-; VBITS_GE_2048-NEXT: sbfx x9, x12, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #8, #24
-; VBITS_GE_2048-NEXT: ubfx x12, x8, #7, #25
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT: stp x9, x11, [sp, #80]
-; VBITS_GE_2048-NEXT: sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #6, #26
-; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #5, #27
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x10, x9, [sp, #64]
-; VBITS_GE_2048-NEXT: sbfx x9, x11, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x10, x8, #4, #28
-; VBITS_GE_2048-NEXT: ubfx x11, x8, #3, #29
-; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT: stp x9, x12, [sp, #48]
-; VBITS_GE_2048-NEXT: sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT: ubfx x12, x8, #2, #30
-; VBITS_GE_2048-NEXT: ubfx x13, x8, #1, #31
-; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT: sbfx x8, x8, #0, #1
-; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT: stp x10, x9, [sp, #32]
-; VBITS_GE_2048-NEXT: sbfx x9, x12, #0, #1
-; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT: stp x9, x11, [sp, #16]
-; VBITS_GE_2048-NEXT: stp x8, x10, [sp]
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [sp]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: and z0.d, z0.d, #0x1
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: sel z0.d, p1, z1.d, z2.d
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: mov sp, x29
-; VBITS_GE_2048-NEXT: .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT: .cfi_restore w30
-; VBITS_GE_2048-NEXT: .cfi_restore w29
-; VBITS_GE_2048-NEXT: ret
- %mask = load <32 x i1>, <32 x i1>* %c
+define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
+ %mask = icmp eq <32 x i64> %op1, %op2
%sel = select <32 x i1> %mask, <32 x i64> %op1, <32 x i64> %op2
store <32 x i64> %sel, <32 x i64>* %a
ret void
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
index a4aba59034e7..485df06b8964 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
@@ -1,35 +1,29 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
; Don't use SVE for 64-bit vectors.
define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: load_v2f32:
-; CHECK: ldr d0, [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
%load = load <2 x float>, <2 x float>* %a
ret <2 x float> %load
}
@@ -37,66 +31,164 @@ define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; Don't use SVE for 128-bit vectors.
define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: load_v4f32:
-; CHECK: ldr q0, [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
%load = load <4 x float>, <4 x float>* %a
ret <4 x float> %load
}
define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: load_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%load = load <8 x float>, <8 x float>* %a
ret <8 x float> %load
}
define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: load_v16f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: load_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: load_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT: ret
+;
+; VBITS_GE_1024-LABEL: load_v16f32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
+;
+; VBITS_GE_2048-LABEL: load_v16f32:
+; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: ret
%load = load <16 x float>, <16 x float>* %a
ret <16 x float> %load
}
define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
-; CHECK-LABEL: load_v32f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: load_v32f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: mov x10, #24
+; VBITS_GE_256-NEXT: mov x11, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: load_v32f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: mov x9, #16
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_512-NEXT: ret
+;
+; VBITS_GE_1024-LABEL: load_v32f32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
+;
+; VBITS_GE_2048-LABEL: load_v32f32:
+; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: ret
%load = load <32 x float>, <32 x float>* %a
ret <32 x float> %load
}
define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
-; CHECK-LABEL: load_v64f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
-; VBITS_LE_512-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2]
-; VBITS_LE_512-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A4]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A5]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A6]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A7]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: load_v64f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: mov x10, #48
+; VBITS_GE_256-NEXT: mov x11, #56
+; VBITS_GE_256-NEXT: mov x12, #32
+; VBITS_GE_256-NEXT: mov x13, #40
+; VBITS_GE_256-NEXT: mov x14, #16
+; VBITS_GE_256-NEXT: mov x15, #24
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x15, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x13, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x12, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x15, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x14, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: load_v64f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: mov x9, #32
+; VBITS_GE_512-NEXT: mov x10, #48
+; VBITS_GE_512-NEXT: mov x11, #16
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_512-NEXT: ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x8]
+; VBITS_GE_512-NEXT: ret
+;
+; VBITS_GE_1024-LABEL: load_v64f32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: mov x9, #32
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
+;
+; VBITS_GE_2048-LABEL: load_v64f32:
+; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT: ret
%load = load <64 x float>, <64 x float>* %a
ret <64 x float> %load
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
index 28f354f47b19..743aa295d75e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
@@ -1,343 +1,363 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; ANDV
;
; No single instruction NEON ANDV support. Use SVE.
-define i8 @andv_v8i8(<8 x i8> %a) #0 {
+define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
ret i8 %res
}
; No single instruction NEON ANDV support. Use SVE.
-define i8 @andv_v16i8(<16 x i8> %a) #0 {
+define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @andv_v32i8(<32 x i8>* %a) #0 {
+define i8 @andv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @andv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: andv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv b[[REDUCE:[0-9]+]], [[PG]], [[AND]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
-
+; VBITS_GE_256-LABEL: andv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: andv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: andv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: andv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @andv_v128i8(<128 x i8>* %a) #0 {
+define i8 @andv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @andv_v256i8(<256 x i8>* %a) #0 {
+define i8 @andv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
ret i8 %res
}
; No single instruction NEON ANDV support. Use SVE.
-define i16 @andv_v4i16(<4 x i16> %a) #0 {
+define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: andv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
ret i16 %res
}
; No single instruction NEON ANDV support. Use SVE.
-define i16 @andv_v8i16(<8 x i16> %a) #0 {
+define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: andv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @andv_v16i16(<16 x i16>* %a) #0 {
+define i16 @andv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: andv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @andv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: andv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv h[[REDUCE:[0-9]+]], [[PG]], [[AND]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: andv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: andv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: andv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: andv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @andv_v64i16(<64 x i16>* %a) #0 {
+define i16 @andv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: andv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @andv_v128i16(<128 x i16>* %a) #0 {
+define i16 @andv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: andv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
ret i16 %res
}
; No single instruction NEON ANDV support. Use SVE.
-define i32 @andv_v2i32(<2 x i32> %a) #0 {
+define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: andv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
ret i32 %res
}
; No single instruction NEON ANDV support. Use SVE.
-define i32 @andv_v4i32(<4 x i32> %a) #0 {
+define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: andv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @andv_v8i32(<8 x i32>* %a) #0 {
+define i32 @andv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: andv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @andv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: andv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv [[REDUCE:s[0-9]+]], [[PG]], [[AND]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: andv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: andv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: andv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: andv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @andv_v32i32(<32 x i32>* %a) #0 {
+define i32 @andv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: andv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @andv_v64i32(<64 x i32>* %a) #0 {
+define i32 @andv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: andv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @andv_v1i64(<1 x i64> %a) #0 {
+define i64 @andv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
ret i64 %res
}
; Use SVE for 128-bit vectors
-define i64 @andv_v2i64(<2 x i64> %a) #0 {
+define i64 @andv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: andv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK: fmov x0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: andv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @andv_v4i64(<4 x i64>* %a) #0 {
+define i64 @andv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: andv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @andv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: andv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv [[REDUCE:d[0-9]+]], [[PG]], [[AND]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: andv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: andv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: andv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: andv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @andv_v16i64(<16 x i64>* %a) #0 {
+define i64 @andv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: andv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @andv_v32i64(<32 x i64>* %a) #0 {
+define i64 @andv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: andv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
ret i64 %res
@@ -348,319 +368,354 @@ define i64 @andv_v32i64(<32 x i64>* %a) #0 {
;
; No single instruction NEON EORV support. Use SVE.
-define i8 @eorv_v8i8(<8 x i8> %a) #0 {
+define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: eorv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
ret i8 %res
}
; No single instruction NEON EORV support. Use SVE.
-define i8 @eorv_v16i8(<16 x i8> %a) #0 {
+define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: eorv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @eorv_v32i8(<32 x i8>* %a) #0 {
+define i8 @eorv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: eorv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @eorv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: eorv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv b[[REDUCE:[0-9]+]], [[PG]], [[EOR]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
-
+; VBITS_GE_256-LABEL: eorv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: eorv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: eorv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: eorv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @eorv_v128i8(<128 x i8>* %a) #0 {
+define i8 @eorv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: eorv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @eorv_v256i8(<256 x i8>* %a) #0 {
+define i8 @eorv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: eorv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
ret i8 %res
}
; No single instruction NEON EORV support. Use SVE.
-define i16 @eorv_v4i16(<4 x i16> %a) #0 {
+define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: eorv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
ret i16 %res
}
; No single instruction NEON EORV support. Use SVE.
-define i16 @eorv_v8i16(<8 x i16> %a) #0 {
+define i16 @eorv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: eorv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @eorv_v16i16(<16 x i16>* %a) #0 {
+define i16 @eorv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: eorv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @eorv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: eorv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv h[[REDUCE:[0-9]+]], [[PG]], [[EOR]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: eorv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: eorv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: eorv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: eorv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @eorv_v64i16(<64 x i16>* %a) #0 {
+define i16 @eorv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: eorv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @eorv_v128i16(<128 x i16>* %a) #0 {
+define i16 @eorv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: eorv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
ret i16 %res
}
; No single instruction NEON EORV support. Use SVE.
-define i32 @eorv_v2i32(<2 x i32> %a) #0 {
+define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: eorv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
ret i32 %res
}
; No single instruction NEON EORV support. Use SVE.
-define i32 @eorv_v4i32(<4 x i32> %a) #0 {
+define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: eorv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @eorv_v8i32(<8 x i32>* %a) #0 {
+define i32 @eorv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: eorv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @eorv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: eorv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv [[REDUCE:s[0-9]+]], [[PG]], [[EOR]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: eorv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: eorv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: eorv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: eorv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @eorv_v32i32(<32 x i32>* %a) #0 {
+define i32 @eorv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: eorv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @eorv_v64i32(<64 x i32>* %a) #0 {
+define i32 @eorv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: eorv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @eorv_v1i64(<1 x i64> %a) #0 {
+define i64 @eorv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
ret i64 %res
}
; Use SVE for 128-bit vectors
-define i64 @eorv_v2i64(<2 x i64> %a) #0 {
+define i64 @eorv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: eorv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK: fmov x0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: eorv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @eorv_v4i64(<4 x i64>* %a) #0 {
+define i64 @eorv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: eorv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @eorv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: eorv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv [[REDUCE:d[0-9]+]], [[PG]], [[EOR]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: eorv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: eorv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: eorv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: eorv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @eorv_v16i64(<16 x i64>* %a) #0 {
+define i64 @eorv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: eorv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
+define i64 @eorv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: eorv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
ret i64 %res
@@ -671,319 +726,354 @@ define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
;
; No single instruction NEON ORV support. Use SVE.
-define i8 @orv_v8i8(<8 x i8> %a) #0 {
+define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
ret i8 %res
}
; No single instruction NEON ORV support. Use SVE.
-define i8 @orv_v16i8(<16 x i8> %a) #0 {
+define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
ret i8 %res
}
-define i8 @orv_v32i8(<32 x i8>* %a) #0 {
+define i8 @orv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
ret i8 %res
}
define i8 @orv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: orv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv b[[REDUCE:[0-9]+]], [[PG]], [[OR]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
-
+; VBITS_GE_256-LABEL: orv_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: orv b0, p0, z0.b
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: orv_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: orv b0, p0, z0.b
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
ret i8 %res
}
-define i8 @orv_v128i8(<128 x i8>* %a) #0 {
+define i8 @orv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
ret i8 %res
}
-define i8 @orv_v256i8(<256 x i8>* %a) #0 {
+define i8 @orv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
ret i8 %res
}
; No single instruction NEON ORV support. Use SVE.
-define i16 @orv_v4i16(<4 x i16> %a) #0 {
+define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: orv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
ret i16 %res
}
; No single instruction NEON ORV support. Use SVE.
-define i16 @orv_v8i16(<8 x i16> %a) #0 {
+define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: orv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
ret i16 %res
}
-define i16 @orv_v16i16(<16 x i16>* %a) #0 {
+define i16 @orv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: orv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
ret i16 %res
}
define i16 @orv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: orv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv h[[REDUCE:[0-9]+]], [[PG]], [[OR]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: orv_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: orv h0, p0, z0.h
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: orv_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: orv h0, p0, z0.h
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
ret i16 %res
}
-define i16 @orv_v64i16(<64 x i16>* %a) #0 {
+define i16 @orv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: orv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
ret i16 %res
}
-define i16 @orv_v128i16(<128 x i16>* %a) #0 {
+define i16 @orv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: orv h0, p0, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
ret i16 %res
}
; No single instruction NEON ORV support. Use SVE.
-define i32 @orv_v2i32(<2 x i32> %a) #0 {
+define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: orv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
ret i32 %res
}
; No single instruction NEON ORV support. Use SVE.
-define i32 @orv_v4i32(<4 x i32> %a) #0 {
+define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: orv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
ret i32 %res
}
-define i32 @orv_v8i32(<8 x i32>* %a) #0 {
+define i32 @orv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: orv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
ret i32 %res
}
define i32 @orv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: orv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv [[REDUCE:s[0-9]+]], [[PG]], [[OR]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: orv_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: orv s0, p0, z0.s
+; VBITS_GE_256-NEXT: fmov w0, s0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: orv_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: orv s0, p0, z0.s
+; VBITS_GE_512-NEXT: fmov w0, s0
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
ret i32 %res
}
-define i32 @orv_v32i32(<32 x i32>* %a) #0 {
+define i32 @orv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: orv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
ret i32 %res
}
-define i32 @orv_v64i32(<64 x i32>* %a) #0 {
+define i32 @orv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: orv s0, p0, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
ret i32 %res
}
; Nothing to do for single element vectors.
-define i64 @orv_v1i64(<1 x i64> %a) #0 {
+define i64 @orv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
ret i64 %res
}
; Use SVE for 128-bit vectors
-define i64 @orv_v2i64(<2 x i64> %a) #0 {
+define i64 @orv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: orv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK: fmov x0, [[REDUCE]]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: orv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
ret i64 %res
}
-define i64 @orv_v4i64(<4 x i64>* %a) #0 {
+define i64 @orv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: orv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
ret i64 %res
}
define i64 @orv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: orv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv [[REDUCE:d[0-9]+]], [[PG]], [[OR]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: orv_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT: orv d0, p0, z0.d
+; VBITS_GE_256-NEXT: fmov x0, d0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: orv_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: orv d0, p0, z0.d
+; VBITS_GE_512-NEXT: fmov x0, d0
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
ret i64 %res
}
-define i64 @orv_v16i64(<16 x i64>* %a) #0 {
+define i64 @orv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: orv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
ret i64 %res
}
-define i64 @orv_v32i64(<32 x i64>* %a) #0 {
+define i64 @orv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: orv d0, p0, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
index 8e8500348be0..dba92869aed1 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
@@ -1,18 +1,7 @@
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -20,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
; LD1B
;
-define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
+define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@@ -36,7 +25,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
+define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -54,21 +43,21 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
}
define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i8:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: ld1b { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b
-; VBITS_EQ_256-NEXT: str d0, [x0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_gather_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1b { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b
+; VBITS_GE_256-NEXT: str d0, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i8:
; VBITS_GE_512: // %bb.0:
@@ -86,17 +75,17 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_1024-NEXT: str q0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <16 x i8*>, <16 x i8*>* %b
%vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
@@ -104,18 +93,18 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <32 x i8*>, <32 x i8*>* %b
%vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -129,7 +118,7 @@ define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; LD1H
;
-define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
+define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@@ -145,7 +134,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
+define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -162,21 +151,21 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
}
define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i16:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_EQ_256-NEXT: str q1, [x0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_gather_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: str q1, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i16:
; VBITS_GE_512: // %bb.0:
@@ -193,17 +182,17 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <16 x i16*>, <16 x i16*>* %b
%vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> undef)
@@ -211,17 +200,17 @@ define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <32 x i16*>, <32 x i16*>* %b
%vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -235,7 +224,7 @@ define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; LD1W
;
-define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
+define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@@ -250,7 +239,7 @@ define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
+define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -266,21 +255,21 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
}
define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i32:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT: ptrue p0.s, vl4
-; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s
-; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_gather_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i32:
; VBITS_GE_512: // %bb.0:
@@ -297,16 +286,16 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <16 x i32*>, <16 x i32*>* %b
%vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
@@ -314,16 +303,16 @@ define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <32 x i32*>, <32 x i32*>* %b
%vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -337,7 +326,7 @@ define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
; LD1D
;
-define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
+define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@@ -351,7 +340,7 @@ define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
+define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -366,17 +355,17 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
}
define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i64:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_gather_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i64:
; VBITS_GE_512: // %bb.0:
@@ -391,14 +380,14 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <16 x i64*>, <16 x i64*>* %b
%vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i64> undef)
@@ -406,14 +395,14 @@ define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%ptrs = load <32 x i64*>, <32 x i64*>* %b
%vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x i64*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 0fe797e547b7..af9bf560afca 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -23,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
; LD1B
;
-define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
+define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0]
@@ -48,7 +34,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
+define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -74,7 +60,6 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
}
define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: masked_gather_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ldr d0, [x0]
@@ -129,78 +114,23 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ldr q0, [x0]
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: zip1 v2.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: zip2 v3.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: shl v3.4h, v3.4h, #8
-; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT: sshr v1.4h, v3.4h, #8
-; VBITS_GE_256-NEXT: sshr v0.4h, v0.4h, #8
-; VBITS_GE_256-NEXT: cmpne p2.d, p0/z, z2.d, #0
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: ld1b { z2.d }, p1/z, [z6.d]
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT: ld1b { z3.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z4.d]
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT: uzp1 v0.16b, v3.16b, v1.16b
-; VBITS_GE_256-NEXT: str q0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ldr q0, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: cmeq v0.16b, v0.16b, #0
-; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [z1.d]
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_1024-NEXT: str q0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmeq v0.16b, v0.16b, #0
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x i8>, <16 x i8>* %a
%ptrs = load <16 x i8*>, <16 x i8*>* %b
%mask = icmp eq <16 x i8> %cval, zeroinitializer
@@ -209,199 +139,23 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16
-; VBITS_GE_256-NEXT: mov x29, sp
-; VBITS_GE_256-NEXT: .cfi_def_cfa w29, 16
-; VBITS_GE_256-NEXT: .cfi_offset w30, -8
-; VBITS_GE_256-NEXT: .cfi_offset w29, -16
-; VBITS_GE_256-NEXT: sub x9, sp, #48
-; VBITS_GE_256-NEXT: and sp, x9, #0xffffffffffffffe0
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0
-; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: zip2 v2.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: shl v3.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: sshr v5.4h, v3.4h, #8
-; VBITS_GE_256-NEXT: mov x8, #20
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1b { z5.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT: zip1 v7.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: shl v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT: umov w8, v5.h[3]
-; VBITS_GE_256-NEXT: umov w9, v5.h[2]
-; VBITS_GE_256-NEXT: umov w10, v5.h[1]
-; VBITS_GE_256-NEXT: sshr v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT: umov w11, v5.h[0]
-; VBITS_GE_256-NEXT: mov z5.d, z4.d
-; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h
-; VBITS_GE_256-NEXT: ext z5.b, z5.b, z4.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s
-; VBITS_GE_256-NEXT: strb w8, [sp, #7]
-; VBITS_GE_256-NEXT: strb w9, [sp, #6]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0
-; VBITS_GE_256-NEXT: strb w10, [sp, #5]
-; VBITS_GE_256-NEXT: strb w11, [sp, #4]
-; VBITS_GE_256-NEXT: ld1b { z7.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT: zip2 v17.8b, v5.8b, v0.8b
-; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT: uzp1 z7.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: shl v17.4h, v17.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z7.h
-; VBITS_GE_256-NEXT: umov w8, v7.h[3]
-; VBITS_GE_256-NEXT: umov w9, v7.h[2]
-; VBITS_GE_256-NEXT: umov w10, v7.h[1]
-; VBITS_GE_256-NEXT: sshr v17.4h, v17.4h, #8
-; VBITS_GE_256-NEXT: umov w11, v7.h[0]
-; VBITS_GE_256-NEXT: sunpklo z7.s, z17.h
-; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s
-; VBITS_GE_256-NEXT: strb w8, [sp, #3]
-; VBITS_GE_256-NEXT: strb w9, [sp, #2]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0
-; VBITS_GE_256-NEXT: strb w10, [sp, #1]
-; VBITS_GE_256-NEXT: strb w11, [sp]
-; VBITS_GE_256-NEXT: ld1b { z7.d }, p2/z, [z16.d]
-; VBITS_GE_256-NEXT: zip1 v16.8b, v5.8b, v0.8b
-; VBITS_GE_256-NEXT: uzp1 z7.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: shl v16.4h, v16.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z7.h
-; VBITS_GE_256-NEXT: umov w8, v7.h[3]
-; VBITS_GE_256-NEXT: umov w9, v7.h[2]
-; VBITS_GE_256-NEXT: umov w10, v7.h[1]
-; VBITS_GE_256-NEXT: sshr v16.4h, v16.4h, #8
-; VBITS_GE_256-NEXT: umov w11, v7.h[0]
-; VBITS_GE_256-NEXT: sunpklo z7.s, z16.h
-; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s
-; VBITS_GE_256-NEXT: strb w8, [sp, #23]
-; VBITS_GE_256-NEXT: strb w9, [sp, #22]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0
-; VBITS_GE_256-NEXT: strb w10, [sp, #21]
-; VBITS_GE_256-NEXT: zip2 v7.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT: strb w11, [sp, #20]
-; VBITS_GE_256-NEXT: zip1 v4.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT: ld1b { z6.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT: shl v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT: shl v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s
-; VBITS_GE_256-NEXT: sshr v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h
-; VBITS_GE_256-NEXT: sshr v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT: umov w8, v6.h[3]
-; VBITS_GE_256-NEXT: umov w9, v6.h[2]
-; VBITS_GE_256-NEXT: umov w10, v6.h[1]
-; VBITS_GE_256-NEXT: umov w11, v6.h[0]
-; VBITS_GE_256-NEXT: sunpklo z6.s, z7.h
-; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: strb w8, [sp, #19]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: strb w9, [sp, #18]
-; VBITS_GE_256-NEXT: strb w10, [sp, #17]
-; VBITS_GE_256-NEXT: strb w11, [sp, #16]
-; VBITS_GE_256-NEXT: ld1b { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: umov w8, v3.h[3]
-; VBITS_GE_256-NEXT: umov w9, v3.h[2]
-; VBITS_GE_256-NEXT: umov w10, v3.h[1]
-; VBITS_GE_256-NEXT: umov w11, v3.h[0]
-; VBITS_GE_256-NEXT: ext v3.16b, v5.16b, v5.16b, #8
-; VBITS_GE_256-NEXT: strb w8, [sp, #15]
-; VBITS_GE_256-NEXT: strb w9, [sp, #14]
-; VBITS_GE_256-NEXT: strb w10, [sp, #13]
-; VBITS_GE_256-NEXT: zip2 v4.8b, v3.8b, v0.8b
-; VBITS_GE_256-NEXT: strb w11, [sp, #12]
-; VBITS_GE_256-NEXT: ld1b { z2.d }, p2/z, [z2.d]
-; VBITS_GE_256-NEXT: shl v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: sshr v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: umov w8, v2.h[3]
-; VBITS_GE_256-NEXT: umov w9, v2.h[2]
-; VBITS_GE_256-NEXT: umov w10, v2.h[1]
-; VBITS_GE_256-NEXT: umov w11, v2.h[0]
-; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: strb w8, [sp, #11]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: strb w9, [sp, #10]
-; VBITS_GE_256-NEXT: zip1 v2.8b, v3.8b, v0.8b
-; VBITS_GE_256-NEXT: strb w10, [sp, #9]
-; VBITS_GE_256-NEXT: strb w11, [sp, #8]
-; VBITS_GE_256-NEXT: ld1b { z1.d }, p2/z, [z1.d]
-; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: umov w8, v1.h[3]
-; VBITS_GE_256-NEXT: umov w9, v1.h[2]
-; VBITS_GE_256-NEXT: umov w10, v1.h[1]
-; VBITS_GE_256-NEXT: umov w11, v1.h[0]
-; VBITS_GE_256-NEXT: sunpklo z1.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: strb w8, [sp, #31]
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: strb w9, [sp, #30]
-; VBITS_GE_256-NEXT: strb w10, [sp, #29]
-; VBITS_GE_256-NEXT: strb w11, [sp, #28]
-; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: umov w8, v0.h[3]
-; VBITS_GE_256-NEXT: umov w9, v0.h[2]
-; VBITS_GE_256-NEXT: umov w10, v0.h[1]
-; VBITS_GE_256-NEXT: umov w11, v0.h[0]
-; VBITS_GE_256-NEXT: strb w8, [sp, #27]
-; VBITS_GE_256-NEXT: strb w9, [sp, #26]
-; VBITS_GE_256-NEXT: strb w10, [sp, #25]
-; VBITS_GE_256-NEXT: strb w11, [sp, #24]
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT: mov sp, x29
-; VBITS_GE_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1b { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x i8>, <32 x i8>* %a
%ptrs = load <32 x i8*>, <32 x i8*>* %b
%mask = icmp eq <32 x i8> %cval, zeroinitializer
@@ -414,7 +168,7 @@ define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; LD1H
;
-define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
+define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
@@ -439,7 +193,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
+define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -463,7 +217,6 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
}
define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: masked_gather_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ldr q0, [x0]
@@ -511,69 +264,21 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z0.h, #0
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: ld1h { z6.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: sunpklo z2.d, z4.s
-; VBITS_GE_256-NEXT: ld1h { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: uzp1 z2.s, z6.s, z6.s
-; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d]
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: mov v2.d[1], v3.d[0]
-; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x i16>, <16 x i16>* %a
%ptrs = load <16 x i16*>, <16 x i16*>* %b
%mask = icmp eq <16 x i16> %cval, zeroinitializer
@@ -582,111 +287,21 @@ define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z3.h, #0
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h
-; VBITS_GE_256-NEXT: ld1h { z17.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z4.h, #0
-; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT: ld1h { z4.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT: ld1h { z6.d }, p3/z, [z6.d]
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z7.h, z17.h, z17.h
-; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s
-; VBITS_GE_256-NEXT: mov v7.d[1], v4.d[0]
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h
-; VBITS_GE_256-NEXT: mov v3.d[1], v6.d[0]
-; VBITS_GE_256-NEXT: sunpklo z6.s, z16.h
-; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: ld1h { z4.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT: sunpklo z5.s, z16.h
-; VBITS_GE_256-NEXT: sunpklo z6.s, z17.h
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: ld1h { z2.d }, p3/z, [z2.d]
-; VBITS_GE_256-NEXT: ld1h { z1.d }, p2/z, [z1.d]
-; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: splice z7.h, p1, z7.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: mov v4.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z1.h
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x i16>, <32 x i16>* %a
%ptrs = load <32 x i16*>, <32 x i16*>* %b
%mask = icmp eq <32 x i16> %cval, zeroinitializer
@@ -699,7 +314,7 @@ define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; LD1W
;
-define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
+define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -720,7 +335,7 @@ define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
+define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -742,7 +357,6 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
}
define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: masked_gather_v8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
@@ -787,61 +401,19 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0
-; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0
-; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ld1w { z4.d }, p2/z, [z4.d]
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z0.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p3/z, [z3.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT: ptrue p1.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z3.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s
-; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x i32>, <16 x i32>* %a
%ptrs = load <16 x i32*>, <16 x i32*>* %b
%mask = icmp eq <16 x i32> %cval, zeroinitializer
@@ -850,97 +422,19 @@ define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: cmpeq p4.s, p0/z, z2.s, #0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d]
-; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z3.s, #0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x i32>, <32 x i32>* %a
%ptrs = load <32 x i32*>, <32 x i32*>* %b
%mask = icmp eq <32 x i32> %cval, zeroinitializer
@@ -954,7 +448,7 @@ define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
;
; Scalarize 1 x i64 gathers
-define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
+define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -976,7 +470,7 @@ define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
+define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -995,7 +489,7 @@ define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
+define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1014,7 +508,6 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
}
define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: masked_gather_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -1040,7 +533,6 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z1.d]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
-
%cval = load <8 x i64>, <8 x i64>* %a
%ptrs = load <8 x i64*>, <8 x i64*>* %b
%mask = icmp eq <8 x i64> %cval, zeroinitializer
@@ -1049,44 +541,16 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT: cmpeq p3.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0
-; VBITS_GE_256-NEXT: cmpeq p4.d, p0/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p3/z, [z4.d]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x i64>, <16 x i64>* %a
%ptrs = load <16 x i64*>, <16 x i64*>* %b
%mask = icmp eq <16 x i64> %cval, zeroinitializer
@@ -1095,68 +559,16 @@ define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
ret void
}
-define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: mov x13, #24
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [z19.d]
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z6.d, #0
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z21.d]
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [z22.d]
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [z20.d]
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z4.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z5.d, #0
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [z16.d]
-; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z7.d, #0
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [z23.d]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x i64>, <32 x i64>* %a
%ptrs = load <32 x i64*>, <32 x i64*>* %b
%mask = icmp eq <32 x i64> %cval, zeroinitializer
@@ -1169,7 +581,7 @@ define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
; LD1H (float)
;
-define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
+define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
@@ -1205,7 +617,7 @@ define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
ret void
}
-define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
+define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -1276,69 +688,21 @@ define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
ret void
}
-define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: ld1h { z6.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: sunpklo z2.d, z4.s
-; VBITS_GE_256-NEXT: ld1h { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: uzp1 z2.s, z6.s, z6.s
-; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d]
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: mov v2.d[1], v3.d[0]
-; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x half>, <16 x half>* %a
%ptrs = load <16 x half*>, <16 x half*>* %b
%mask = fcmp oeq <16 x half> %cval, zeroinitializer
@@ -1347,111 +711,21 @@ define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
ret void
}
-define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, #0.0
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h
-; VBITS_GE_256-NEXT: ld1h { z17.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z4.h, #0.0
-; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT: ld1h { z4.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT: ld1h { z6.d }, p3/z, [z6.d]
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z7.h, z17.h, z17.h
-; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s
-; VBITS_GE_256-NEXT: mov v7.d[1], v4.d[0]
-; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h
-; VBITS_GE_256-NEXT: mov v3.d[1], v6.d[0]
-; VBITS_GE_256-NEXT: sunpklo z6.s, z16.h
-; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: ld1h { z4.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT: sunpklo z5.s, z16.h
-; VBITS_GE_256-NEXT: sunpklo z6.s, z17.h
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: ld1h { z2.d }, p3/z, [z2.d]
-; VBITS_GE_256-NEXT: ld1h { z1.d }, p2/z, [z1.d]
-; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: splice z7.h, p1, z7.h, z3.h
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: mov v4.d[1], v2.d[0]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z1.h
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x half>, <32 x half>* %a
%ptrs = load <32 x half*>, <32 x half*>* %b
%mask = fcmp oeq <32 x half> %cval, zeroinitializer
@@ -1464,7 +738,7 @@ define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
; LD1W (float)
;
-define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
+define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -1485,7 +759,7 @@ define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
ret void
}
-define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
+define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -1551,61 +825,19 @@ define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
ret void
}
-define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ld1w { z4.d }, p2/z, [z4.d]
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z0.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p3/z, [z3.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT: ptrue p1.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z3.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s
-; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x float>, <16 x float>* %a
%ptrs = load <16 x float*>, <16 x float*>* %b
%mask = fcmp oeq <16 x float> %cval, zeroinitializer
@@ -1614,97 +846,19 @@ define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
ret void
}
-define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x float>, <32 x float>* %a
%ptrs = load <32 x float*>, <32 x float*>* %b
%mask = fcmp oeq <32 x float> %cval, zeroinitializer
@@ -1718,7 +872,7 @@ define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
;
; Scalarize 1 x double gathers
-define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
+define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -1740,7 +894,7 @@ define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
ret void
}
-define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
+define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -1759,7 +913,7 @@ define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
ret void
}
-define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
+define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1811,44 +965,16 @@ define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
ret void
}
-define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0
-; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p3/z, [z4.d]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <16 x double>, <16 x double>* %a
%ptrs = load <16 x double*>, <16 x double*>* %b
%mask = fcmp oeq <16 x double> %cval, zeroinitializer
@@ -1857,68 +983,16 @@ define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
ret void
}
-define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: mov x13, #24
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [z19.d]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z21.d]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [z22.d]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [z20.d]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [z16.d]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z7.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [z23.d]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%cval = load <32 x double>, <32 x double>* %a
%ptrs = load <32 x double*>, <32 x double*>* %b
%mask = fcmp oeq <32 x double> %cval, zeroinitializer
@@ -1930,61 +1004,19 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
; The above tests test the types, the below tests check that the addressing
; modes still function
-define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p1.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw #1]
-; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw #1]
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw #1]
-; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw #1]
-; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_sext_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -1995,44 +1027,16 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
ret void
}
-define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p3/z, [x2, z4.s, sxtw #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x2, z6.s, sxtw #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x2, z5.s, sxtw #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p4/z, [x2, z7.s, sxtw #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_sext_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2]
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -2043,77 +1047,16 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b,
ret void
}
-define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ptrue p1.s, vl8
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z18.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z16.s }, p1/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z19.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z5.d, #0.0
-; VBITS_GE_256-NEXT: sunpklo z22.d, z18.s
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT: sunpklo z21.d, z17.s
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x2, z18.d, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT: ext z17.b, z17.b, z17.b, #16
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x2, z22.d, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0
-; VBITS_GE_256-NEXT: sunpklo z20.d, z16.s
-; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT: sunpklo z17.d, z17.s
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x2, z21.d, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0
-; VBITS_GE_256-NEXT: sunpklo z16.d, z16.s
-; VBITS_GE_256-NEXT: sunpklo z23.d, z19.s
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [x2, z17.d, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0
-; VBITS_GE_256-NEXT: sunpklo z19.d, z19.s
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [x2, z20.d, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x2, z16.d, lsl #3]
-; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0
-; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, #0.0
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p2/z, [x2, z23.d, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x2, z19.d, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x2, z1.d, lsl #3]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_sext_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x2, z1.d, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x double>, <32 x double>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -2124,61 +1067,19 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b
ret void
}
-define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_zext:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p1.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw #1]
-; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw #1]
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw #1]
-; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw #1]
-; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_zext:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1]
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = zext <32 x i32> %idxs to <32 x i64>
@@ -2189,61 +1090,19 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
ret void
}
-define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_sext:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p1.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw]
-; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw]
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw]
-; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw]
-; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_sext:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw]
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_unscaled_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -2255,61 +1114,19 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
ret void
}
-define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_zext:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ptrue p1.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw]
-; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw]
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw]
-; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw]
-; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT: ptrue p1.h, vl8
-; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_zext:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw]
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_unscaled_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = zext <32 x i32> %idxs to <32 x i64>
@@ -2321,97 +1138,19 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8
ret void
}
-define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_64b_scaled:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d, lsl #2]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d, lsl #2]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d, lsl #2]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d, lsl #2]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_64b_scaled:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_64b_scaled:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i64>, <32 x i64>* %b
%ptrs = getelementptr float, float* %base, <32 x i64> %idxs
@@ -2421,97 +1160,19 @@ define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %
ret void
}
-define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_64b_unscaled:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_64b_unscaled:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i64>, <32 x i64>* %b
%byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %idxs
@@ -2522,97 +1183,19 @@ define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %b
ret void
}
-define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
-; VBITS_GE_256-LABEL: masked_gather_vec_plus_reg:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_vec_plus_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%bases = load <32 x i8*>, <32 x i8*>* %b
%byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 %off
@@ -2623,97 +1206,19 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o
ret void
}
-define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_vec_plus_imm:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d, #4]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d, #4]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d, #4]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d, #4]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d, #4]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d, #4]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d, #4]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d, #4]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d, #4]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_vec_plus_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d, #4]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%bases = load <32 x i8*>, <32 x i8*>* %b
%byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 4
@@ -2724,115 +1229,21 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
ret void
}
-define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) #0 {
-; VBITS_GE_256-LABEL: masked_gather_passthru:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z23.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z4.s, #0.0
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: punpklo p3.h, p2.b
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: ld1d { z19.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z21.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z22.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x2, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x2, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x2]
-; VBITS_GE_256-NEXT: ld1w { z4.d }, p3/z, [z23.d]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z17.s, #0.0
-; VBITS_GE_256-NEXT: mov z17.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT: bif v4.16b, v16.16b, v17.16b
-; VBITS_GE_256-NEXT: ext z17.b, z17.b, z17.b, #16
-; VBITS_GE_256-NEXT: sunpklo z23.d, z17.s
-; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0
-; VBITS_GE_256-NEXT: ld1w { z22.d }, p4/z, [z22.d]
-; VBITS_GE_256-NEXT: ld1w { z21.d }, p2/z, [z21.d]
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z20.s, #0.0
-; VBITS_GE_256-NEXT: mov z20.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p2.b
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: uzp1 z21.s, z21.s, z21.s
-; VBITS_GE_256-NEXT: uzp1 z22.s, z22.s, z22.s
-; VBITS_GE_256-NEXT: bif v21.16b, v5.16b, v20.16b
-; VBITS_GE_256-NEXT: ext z20.b, z20.b, z20.b, #16
-; VBITS_GE_256-NEXT: sunpklo z23.d, z20.s
-; VBITS_GE_256-NEXT: ext z5.b, z5.b, z5.b, #16
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0
-; VBITS_GE_256-NEXT: ld1w { z19.d }, p4/z, [z19.d]
-; VBITS_GE_256-NEXT: ld1w { z18.d }, p3/z, [z18.d]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z7.s, #0.0
-; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: uzp1 z18.s, z18.s, z18.s
-; VBITS_GE_256-NEXT: bif v18.16b, v1.16b, v7.16b
-; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16
-; VBITS_GE_256-NEXT: sunpklo z23.d, z7.s
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0
-; VBITS_GE_256-NEXT: mov z23.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p4/z, [z2.d]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT: bit v16.16b, v22.16b, v17.16b
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: bif v3.16b, v6.16b, v23.16b
-; VBITS_GE_256-NEXT: ext z23.b, z23.b, z23.b, #16
-; VBITS_GE_256-NEXT: sunpklo z17.d, z23.s
-; VBITS_GE_256-NEXT: ext z6.b, z6.b, z6.b, #16
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z17.d, #0
-; VBITS_GE_256-NEXT: uzp1 z17.s, z19.s, z19.s
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT: ptrue p1.s, vl4
-; VBITS_GE_256-NEXT: bit v5.16b, v17.16b, v20.16b
-; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z16.s
-; VBITS_GE_256-NEXT: bit v1.16b, v2.16b, v7.16b
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: bif v0.16b, v6.16b, v23.16b
-; VBITS_GE_256-NEXT: splice z21.s, p1, z21.s, z5.s
-; VBITS_GE_256-NEXT: splice z18.s, p1, z18.s, z1.s
-; VBITS_GE_256-NEXT: st1w { z21.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z18.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_passthru:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x2]
-; VBITS_GE_2048-NEXT: punpklo p2.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z1.d }, p2/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_2048-NEXT: mov z0.s, p1/m, z1.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x2]
+; CHECK-NEXT: punpklo p2.h, p1.b
+; CHECK-NEXT: ld1w { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: mov z0.s, p1/m, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%ptrs = load <32 x float*>, <32 x float*>* %b
%passthru = load <32 x float>, <32 x float>* %c
@@ -2842,97 +1253,19 @@ define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x f
ret void
}
-define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_passthru_0:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x14, #28
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #20
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d]
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT: punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d]
-; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d]
-; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d]
-; VBITS_GE_256-NEXT: ptrue p3.s, vl4
-; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_passthru_0:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_passthru_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%cvals = load <32 x float>, <32 x float>* %a
%ptrs = load <32 x float*>, <32 x float*>* %b
%mask = fcmp oeq <32 x float> %cvals, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index f4ce76e9986c..28e442e4cfe2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -1,28 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
;
; Masked Loads
;
-define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
+
+define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
@@ -53,7 +40,7 @@ define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
ret <2 x half> %load
}
-define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
+define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -71,7 +58,7 @@ define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
ret <2 x float> %load
}
-define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
+define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -89,7 +76,7 @@ define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
ret <4 x float> %load
}
-define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
+define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -107,6 +94,22 @@ define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
}
define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -123,16 +126,16 @@ define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0
ret <16 x float> %load
}
-define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
-; VBITS_GE_1024-LABEL: masked_load_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_load_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%a = load <32 x float>, <32 x float>* %ap
%b = load <32 x float>, <32 x float>* %bp
%mask = fcmp oeq <32 x float> %a, %b
@@ -140,16 +143,16 @@ define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0
ret <32 x float> %load
}
-define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%a = load <64 x float>, <64 x float>* %ap
%b = load <64 x float>, <64 x float>* %bp
%mask = fcmp oeq <64 x float> %a, %b
@@ -158,6 +161,22 @@ define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0
}
define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w9, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9]
+; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
+; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x9]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p2/z, [x0]
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x8, x9]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
@@ -175,6 +194,22 @@ define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
}
define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p2/z, [x0]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
@@ -192,6 +227,22 @@ define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
}
define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -209,6 +260,22 @@ define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
}
define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -226,6 +293,24 @@ define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
}
define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -244,6 +329,24 @@ define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0
}
define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -262,6 +365,21 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
}
define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
@@ -278,6 +396,24 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
}
define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x1]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl16
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
+; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -294,6 +430,25 @@ define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
}
define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr d0, [x1]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
+; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -310,6 +465,21 @@ define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
}
define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -326,6 +496,24 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
}
define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x1]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -342,6 +530,21 @@ define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
}
define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -358,6 +561,21 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
}
define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
@@ -374,6 +592,24 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
}
define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x1]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl16
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
+; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -390,6 +626,25 @@ define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
}
define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr d0, [x1]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
+; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -406,6 +661,21 @@ define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
}
define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -422,6 +692,24 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
}
define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x1]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -438,6 +726,21 @@ define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
}
define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -454,6 +757,30 @@ define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
}
define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z1.h, #0
+; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: ptrue p1.b, vl16
+; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p1.b, vl32
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
@@ -470,6 +797,33 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp
}
define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: ptrue p1.b, vl16
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -486,6 +840,35 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp
}
define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p1.b, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -502,6 +885,32 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0
}
define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: ptrue p1.h, vl16
+; VBITS_GE_256-NEXT: sunpklo z0.h, z1.b
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -518,6 +927,33 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %
}
define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -534,6 +970,30 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp)
}
define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0]
+; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -550,6 +1010,30 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
}
define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z1.h, #0
+; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: ptrue p1.b, vl16
+; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p1.b, vl32
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
@@ -566,6 +1050,33 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp
}
define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: ptrue p1.b, vl16
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -582,6 +1093,35 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp
}
define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p1.b, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -598,6 +1138,32 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0
}
define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: ptrue p1.h, vl16
+; VBITS_GE_256-NEXT: sunpklo z0.h, z1.b
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -614,6 +1180,33 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %
}
define <8 x i64> @masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -630,6 +1223,30 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp)
}
define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl4
+; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0]
+; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -645,15 +1262,15 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
ret <8 x i64> %ext
}
-define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v128i8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <128 x i8>, <128 x i8>* %bp
%mask = icmp eq <128 x i8> %b, zeroinitializer
%load = call <128 x i8> @llvm.masked.load.v128i8(<128 x i8>* %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
@@ -661,15 +1278,15 @@ define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
ret <128 x i16> %ext
}
-define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1b { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v64i8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1sb { z0.s }, p1/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <64 x i8>, <64 x i8>* %bp
%mask = icmp eq <64 x i8> %b, zeroinitializer
%load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
@@ -677,15 +1294,15 @@ define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
ret <64 x i32> %ext
}
-define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v32i8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1sb { z0.d }, p1/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <32 x i8>, <32 x i8>* %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
@@ -693,15 +1310,15 @@ define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
ret <32 x i64> %ext
}
-define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v64i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <64 x i16>, <64 x i16>* %bp
%mask = icmp eq <64 x i16> %b, zeroinitializer
%load = call <64 x i16> @llvm.masked.load.v64i16(<64 x i16>* %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
@@ -709,15 +1326,15 @@ define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
ret <64 x i32> %ext
}
-define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v32i16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1sh { z0.d }, p1/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <32 x i16>, <32 x i16>* %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
@@ -725,15 +1342,15 @@ define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
ret <32 x i64> %ext
}
-define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v32i32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <32 x i32>, <32 x i32>* %bp
%mask = icmp eq <32 x i32> %b, zeroinitializer
%load = call <32 x i32> @llvm.masked.load.v32i32(<32 x i32>* %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
@@ -741,15 +1358,15 @@ define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
ret <32 x i64> %ext
}
-define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v128i8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <128 x i8>, <128 x i8>* %bp
%mask = icmp eq <128 x i8> %b, zeroinitializer
%load = call <128 x i8> @llvm.masked.load.v128i8(<128 x i8>* %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
@@ -757,15 +1374,15 @@ define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
ret <128 x i16> %ext
}
-define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1b { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v64i8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1b { z0.s }, p1/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <64 x i8>, <64 x i8>* %bp
%mask = icmp eq <64 x i8> %b, zeroinitializer
%load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
@@ -773,15 +1390,15 @@ define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
ret <64 x i32> %ext
}
-define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v32i8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1b { z0.d }, p1/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <32 x i8>, <32 x i8>* %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
@@ -789,15 +1406,15 @@ define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
ret <32 x i64> %ext
}
-define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v64i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <64 x i16>, <64 x i16>* %bp
%mask = icmp eq <64 x i16> %b, zeroinitializer
%load = call <64 x i16> @llvm.masked.load.v64i16(<64 x i16>* %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
@@ -805,15 +1422,15 @@ define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
ret <64 x i32> %ext
}
-define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v32i16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1h { z0.d }, p1/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <32 x i16>, <32 x i16>* %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
@@ -821,15 +1438,15 @@ define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
ret <32 x i64> %ext
}
-define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT: ret
+define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v32i32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: ret
%b = load <32 x i32>, <32 x i32>* %bp
%mask = icmp eq <32 x i32> %b, zeroinitializer
%load = call <32 x i32> @llvm.masked.load.v32i32(<32 x i32>* %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
@@ -838,6 +1455,21 @@ define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
}
define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -854,6 +1486,21 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp)
}
define <8 x i64> @masked_load_zext_sgt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x9, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index edf937ab562e..58834bf39eb8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -1,31 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function masked_scatter_v8i8,masked_scatter_v8i16,masked_scatter_v8i32,masked_scatter_v8i64 --prefix VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; ST1B
;
-define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
+define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0]
@@ -47,7 +31,7 @@ define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
+define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -70,36 +54,36 @@ define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
}
define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i8:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: ldr d0, [x0]
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: cmeq v1.8b, v0.8b, #0
-; VBITS_EQ_256-NEXT: zip1 v5.8b, v0.8b, v0.8b
-; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: zip1 v2.8b, v1.8b, v0.8b
-; VBITS_EQ_256-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; VBITS_EQ_256-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8
-; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8
-; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8
-; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8
-; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
-; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; VBITS_EQ_256-NEXT: uunpklo z1.s, z5.h
-; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT: st1b { z1.d }, p1, [z4.d]
-; VBITS_EQ_256-NEXT: st1b { z0.d }, p0, [z3.d]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr d0, [x0]
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0
+; VBITS_GE_256-NEXT: zip1 v5.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b
+; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b
+; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8
+; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8
+; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: uunpklo z1.s, z5.h
+; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1b { z1.d }, p1, [z4.d]
+; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ldr d0, [x0]
@@ -122,22 +106,22 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ldr q0, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: cmeq v2.16b, v0.16b, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: sunpklo z2.h, z2.b
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_1024-NEXT: st1b { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmeq v2.16b, v0.16b, #0
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z2.h, z2.b
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z2.d, z2.s
+; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; CHECK-NEXT: st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x i8>, <16 x i8>* %a
%ptrs = load <16 x i8*>, <16 x i8*>* %b
%mask = icmp eq <16 x i8> %vals, zeroinitializer
@@ -145,22 +129,22 @@ define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1b { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x i8>, <32 x i8>* %a
%ptrs = load <32 x i8*>, <32 x i8*>* %b
%mask = icmp eq <32 x i8> %vals, zeroinitializer
@@ -172,7 +156,7 @@ define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; ST1H
;
-define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
+define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
@@ -194,7 +178,7 @@ define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
+define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -216,30 +200,30 @@ define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
}
define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i16:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: ldr q0, [x0]
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: cmeq v1.8h, v0.8h, #0
-; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: sunpklo z2.s, z1.h
-; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT: st1h { z0.d }, p1, [z2.d]
-; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; VBITS_EQ_256-NEXT: uunpklo z1.d, z3.s
-; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [z4.d]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x0]
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1h { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: uunpklo z1.d, z3.s
+; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ldr q0, [x0]
@@ -260,20 +244,20 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x i16>, <16 x i16>* %a
%ptrs = load <16 x i16*>, <16 x i16*>* %b
%mask = icmp eq <16 x i16> %vals, zeroinitializer
@@ -281,20 +265,20 @@ define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x i16>, <32 x i16>* %a
%ptrs = load <32 x i16*>, <32 x i16*>* %b
%mask = icmp eq <32 x i16> %vals, zeroinitializer
@@ -306,7 +290,7 @@ define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; ST1W
;
-define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
+define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -325,7 +309,7 @@ define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
+define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -345,28 +329,28 @@ define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
}
define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i32:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT: ptrue p1.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1]
-; VBITS_EQ_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_EQ_256-NEXT: uunpklo z4.d, z0.s
-; VBITS_EQ_256-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_EQ_256-NEXT: punpklo p0.h, p0.b
-; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_EQ_256-NEXT: and p0.b, p0/z, p0.b, p1.b
-; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z2.d, #0
-; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: st1w { z4.d }, p0, [z3.d]
-; VBITS_EQ_256-NEXT: st1w { z0.d }, p1, [z1.d]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p1.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s
+; VBITS_GE_256-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b
+; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1w { z4.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT: st1w { z0.d }, p1, [z1.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
@@ -385,18 +369,18 @@ define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x i32>, <16 x i32>* %a
%ptrs = load <16 x i32*>, <16 x i32*>* %b
%mask = icmp eq <16 x i32> %vals, zeroinitializer
@@ -404,18 +388,18 @@ define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x i32>, <32 x i32>* %a
%ptrs = load <32 x i32*>, <32 x i32*>* %b
%mask = icmp eq <32 x i32> %vals, zeroinitializer
@@ -428,7 +412,7 @@ define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
;
; Scalarize 1 x i64 scatters
-define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
+define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -447,7 +431,7 @@ define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
ret void
}
-define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
+define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -464,7 +448,7 @@ define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
+define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -481,20 +465,20 @@ define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
}
define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i64:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_EQ_256-NEXT: cmpeq p0.d, p0/z, z1.d, #0
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [z3.d]
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p1, [z2.d]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -510,15 +494,15 @@ define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq p0.d, p0/z, z0.d, #0
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x i64>, <16 x i64>* %a
%ptrs = load <16 x i64*>, <16 x i64*>* %b
%mask = icmp eq <16 x i64> %vals, zeroinitializer
@@ -526,15 +510,15 @@ define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x i64>, <32 x i64>* %a
%ptrs = load <32 x i64*>, <32 x i64*>* %b
%mask = icmp eq <32 x i64> %vals, zeroinitializer
@@ -546,7 +530,7 @@ define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
; ST1H (float)
;
-define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
+define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
@@ -580,7 +564,7 @@ define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
+define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -602,6 +586,30 @@ define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
}
define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
+; VBITS_GE_256-LABEL: masked_scatter_v8f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ldr q0, [x0]
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: st1h { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: uunpklo z1.d, z3.s
+; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ldr q0, [x0]
@@ -622,20 +630,20 @@ define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x half>, <16 x half>* %a
%ptrs = load <16 x half*>, <16 x half*>* %b
%mask = fcmp oeq <16 x half> %vals, zeroinitializer
@@ -643,20 +651,20 @@ define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%ptrs = load <32 x half*>, <32 x half*>* %b
%mask = fcmp oeq <32 x half> %vals, zeroinitializer
@@ -668,7 +676,7 @@ define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
; ST1W (float)
;
-define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
+define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -687,7 +695,7 @@ define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
+define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -707,6 +715,28 @@ define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
}
define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
+; VBITS_GE_256-LABEL: masked_scatter_v8f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p1.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1]
+; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s
+; VBITS_GE_256-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b
+; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: st1w { z4.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT: st1w { z0.d }, p1, [z1.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
@@ -725,18 +755,18 @@ define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p1.d, vl16
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x float>, <16 x float>* %a
%ptrs = load <16 x float*>, <16 x float*>* %b
%mask = fcmp oeq <16 x float> %vals, zeroinitializer
@@ -744,18 +774,18 @@ define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%ptrs = load <32 x float*>, <32 x float*>* %b
%mask = fcmp oeq <32 x float> %vals, zeroinitializer
@@ -768,7 +798,7 @@ define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
;
; Scalarize 1 x double scatters
-define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
+define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -787,7 +817,7 @@ define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
ret void
}
-define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
+define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -804,7 +834,7 @@ define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
ret void
}
-define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
+define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -821,6 +851,20 @@ define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
}
define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
+; VBITS_GE_256-LABEL: masked_scatter_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
+; VBITS_GE_256-NEXT: fcmeq p0.d, p0/z, z1.d, #0.0
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_512-LABEL: masked_scatter_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -836,15 +880,15 @@ define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
ret void
}
-define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <16 x double>, <16 x double>* %a
%ptrs = load <16 x double*>, <16 x double*>* %b
%mask = fcmp oeq <16 x double> %vals, zeroinitializer
@@ -852,15 +896,15 @@ define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
ret void
}
-define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x double>, <32 x double>* %a
%ptrs = load <32 x double*>, <32 x double*>* %b
%mask = fcmp oeq <32 x double> %vals, zeroinitializer
@@ -871,18 +915,18 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
; The above tests test the types, the below tests check that the addressing
; modes still function
-define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
+; CHECK-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -892,15 +936,15 @@ define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
ret void
}
-define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x2, z1.s, sxtw #2]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: st1w { z0.s }, p0, [x2, z1.s, sxtw #2]
+; CHECK-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -910,15 +954,15 @@ define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b
ret void
}
-define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x2, z1.d, lsl #3]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: st1d { z0.d }, p0, [x2, z1.d, lsl #3]
+; CHECK-NEXT: ret
%vals = load <32 x double>, <32 x double>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -928,18 +972,18 @@ define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %
ret void
}
-define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_zext:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
+; CHECK-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = zext <32 x i32> %idxs to <32 x i64>
@@ -949,18 +993,18 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal
ret void
}
-define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_sext:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw]
+; CHECK-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
@@ -971,18 +1015,18 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i
ret void
}
-define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_zext:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ptrue p1.s, vl32
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw]
+; CHECK-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = zext <32 x i32> %idxs to <32 x i64>
@@ -993,18 +1037,18 @@ define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i
ret void
}
-define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_64b_scaled:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_64b_scaled:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2]
+; CHECK-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i64>, <32 x i64>* %b
%ptrs = getelementptr float, float* %base, <32 x i64> %idxs
@@ -1013,18 +1057,18 @@ define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float*
ret void
}
-define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_64b_unscaled:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_64b_unscaled:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i64>, <32 x i64>* %b
%byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %idxs
@@ -1034,18 +1078,18 @@ define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %
ret void
}
-define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_vec_plus_reg:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_vec_plus_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d]
+; CHECK-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%bases = load <32 x i8*>, <32 x i8*>* %b
%byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 %off
@@ -1055,18 +1099,18 @@ define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %
ret void
}
-define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_vec_plus_imm:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d, #4]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_vec_plus_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ptrue p1.d, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p0, [z1.d, #4]
+; CHECK-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%bases = load <32 x i8*>, <32 x i8*>* %b
%byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 4
@@ -1084,18 +1128,18 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
; NOTE: For this test to function correctly it's critical for %vals to be in a
; different block to the scatter store. If not, the problematic bitcast will be
; removed before operation legalisation and thus not exercise the combine.
-define void @masked_scatter_bitcast_infinite_loop(<8 x double>* %a, <8 x double*>* %b, i1 %cond) #0 {
-; VBITS_GE_512-LABEL: masked_scatter_bitcast_infinite_loop:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: tbz w2, #0, .LBB47_2
-; VBITS_GE_512-NEXT: // %bb.1: // %bb.1
-; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_512-NEXT: .LBB47_2: // %bb.2
-; VBITS_GE_512-NEXT: ret
+define void @masked_scatter_bitcast_infinite_loop(<8 x double>* %a, <8 x double*>* %b, i1 %cond) vscale_range(4,0) #0 {
+; CHECK-LABEL: masked_scatter_bitcast_infinite_loop:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: tbz w2, #0, .LBB47_2
+; CHECK-NEXT: // %bb.1: // %bb.1
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: .LBB47_2: // %bb.2
+; CHECK-NEXT: ret
%vals = load volatile <8 x double>, <8 x double>* %a
br i1 %cond, label %bb.1, label %bb.2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 0e64e78d5505..3d6099e9a792 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -1,28 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-;;
-;; Masked Stores
-;;
-define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
+;
+; Masked Stores
+;
+
+define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
@@ -52,8 +39,7 @@ define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
ret void
}
-
-define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
+define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -70,7 +56,7 @@ define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
ret void
}
-define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
+define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -87,7 +73,7 @@ define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
ret void
}
-define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
+define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -133,39 +119,15 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
ret void
}
-define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
-; VBITS_GE_256-LABEL: masked_store_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT: st1w { z0.s }, p3, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p1, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: masked_store_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_store_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%a = load <32 x float>, <32 x float>* %ap
%b = load <32 x float>, <32 x float>* %bp
%mask = fcmp oeq <32 x float> %a, %b
@@ -173,59 +135,15 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
ret void
}
-define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
-; VBITS_GE_256-LABEL: masked_store_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #56
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: mov x11, #32
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: mov x13, #16
-; VBITS_GE_256-NEXT: mov x14, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s
-; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s
-; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s
-; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s
-; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s
-; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT: st1w { z0.s }, p7, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p6, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p5, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p4, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p3, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p2, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z6.s }, p1, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: masked_store_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_store_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%a = load <64 x float>, <64 x float>* %ap
%b = load <64 x float>, <64 x float>* %bp
%mask = fcmp oeq <64 x float> %a, %b
@@ -266,7 +184,6 @@ define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>
; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
-
%a = load <8 x i64>, <8 x i64>* %ap
%b = load <8 x i64>, <8 x i64>* %bp
%mask = icmp eq <8 x i64> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
index 4156689233fc..27389a7f1eef 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -163,27 +163,27 @@ define void @test_revwv8i32v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; REVH pattern for shuffle v32i16 with 256 bits and 512 bits SVE.
define void @test_revhv32i16(<32 x i16>* %a) #0 {
-; VBITS_EQ_256-LABEL: test_revhv32i16:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
-; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT: ptrue p1.d
-; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT: revh z0.d, p1/m, z0.d
-; VBITS_EQ_256-NEXT: revh z1.d, p1/m, z1.d
-; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_EQ_256-NEXT: ret
-;
; VBITS_GE_256-LABEL: test_revhv32i16:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl32
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: revh z0.d, p1/m, z0.d
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: revh z1.d, p1/m, z1.d
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: test_revhv32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ptrue p1.d
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: revh z0.d, p1/m, z0.d
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%tmp1 = load <32 x i16>, <32 x i16>* %a
%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
store <32 x i16> %tmp2, <32 x i16>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
index e0857f4c71c7..24d2095b57af 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
@@ -1,54 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; RBIT
;
-define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
ret <8 x i8> %res
}
-define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
ret <16 x i8> %res
}
-define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
+define void @bitreverse_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
store <32 x i8> %res, <32 x i8>* %a
@@ -56,80 +48,91 @@ define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
}
define void @bitreverse_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: bitreverse_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT: rbit z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: bitreverse_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
store <64 x i8> %res, <64 x i8>* %a
ret void
}
-define void @bitreverse_v128i8(<128 x i8>* %a) #0 {
+define void @bitreverse_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
-define void @bitreverse_v256i8(<256 x i8>* %a) #0 {
+define void @bitreverse_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.b, p0/m, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
store <256 x i8> %res, <256 x i8>* %a
ret void
}
-define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
-define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
-define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
+define void @bitreverse_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -137,80 +140,91 @@ define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
}
define void @bitreverse_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: bitreverse_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: rbit z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
-define void @bitreverse_v64i16(<64 x i16>* %a) #0 {
+define void @bitreverse_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @bitreverse_v128i16(<128 x i16>* %a) #0 {
+define void @bitreverse_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
ret void
}
-define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
-define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
-define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
+define void @bitreverse_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -218,80 +232,91 @@ define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
}
define void @bitreverse_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: bitreverse_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: rbit z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
-define void @bitreverse_v32i32(<32 x i32>* %a) #0 {
+define void @bitreverse_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @bitreverse_v64i32(<64 x i32>* %a) #0 {
+define void @bitreverse_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
ret void
}
-define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
-define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
-define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
+define void @bitreverse_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@@ -299,49 +324,53 @@ define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
}
define void @bitreverse_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: bitreverse_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: rbit z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: rbit z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: rbit z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
-define void @bitreverse_v16i64(<16 x i64>* %a) #0 {
+define void @bitreverse_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
+define void @bitreverse_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rbit z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
@@ -353,30 +382,33 @@ define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @bswap_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i16:
-; CHECK: rev16 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @bswap_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i16:
-; CHECK: rev16 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
-define void @bswap_v16i16(<16 x i16>* %a) #0 {
+define void @bswap_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -384,49 +416,53 @@ define void @bswap_v16i16(<16 x i16>* %a) #0 {
}
define void @bswap_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: bswap_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: revb z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT: revb z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bswap_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: revb z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
-define void @bswap_v64i16(<64 x i16>* %a) #0 {
+define void @bswap_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @bswap_v128i16(<128 x i16>* %a) #0 {
+define void @bswap_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
@@ -434,30 +470,33 @@ define void @bswap_v128i16(<128 x i16>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i32:
-; CHECK: rev32 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @bswap_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i32:
-; CHECK: rev32 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
-define void @bswap_v8i32(<8 x i32>* %a) #0 {
+define void @bswap_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: revb z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -465,49 +504,53 @@ define void @bswap_v8i32(<8 x i32>* %a) #0 {
}
define void @bswap_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: bswap_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: revb z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT: revb z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bswap_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: revb z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
-define void @bswap_v32i32(<32 x i32>* %a) #0 {
+define void @bswap_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: revb z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @bswap_v64i32(<64 x i32>* %a) #0 {
+define void @bswap_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: revb z0.s, p0/m, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
@@ -515,30 +558,33 @@ define void @bswap_v64i32(<64 x i32>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @bswap_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v1i64:
-; CHECK: rev64 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.8b, v0.8b
+; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @bswap_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i64:
-; CHECK: rev64 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.16b, v0.16b
+; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
-define void @bswap_v4i64(<4 x i64>* %a) #0 {
+define void @bswap_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: revb z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@@ -546,49 +592,53 @@ define void @bswap_v4i64(<4 x i64>* %a) #0 {
}
define void @bswap_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: bswap_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: revb z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT: revb z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bswap_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: revb z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
-define void @bswap_v16i64(<16 x i64>* %a) #0 {
+define void @bswap_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: revb z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @bswap_v32i64(<32 x i64>* %a) #0 {
+define void @bswap_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: revb z0.d, p0/m, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
@@ -640,4 +690,3 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)
-
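As a point of reference, the snippet below is a minimal sketch of the restructured test style (illustrative only, not part of this patch; the function name is made up). With vscale_range(2,0) on the function, i.e. vscale >= 2 and therefore an SVE register of at least 256 bits, every RUN configuration produces the same code, so a single CHECK prefix suffices and utils/update_llc_test_checks.py can regenerate the body mechanically. The expected assembly mirrors the bitreverse_v8i32 checks shown above.

; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

define void @example_bitreverse_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: example_bitreverse_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}

declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
attributes #0 = { "target-features"="+sve" }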
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
index 245f34248a54..e62cbcda9c7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -1,23 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
+define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -29,7 +17,7 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
ret <8 x i8> %res
}
-define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
+define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -41,7 +29,7 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
ret <16 x i8> %res
}
-define void @sdiv_v32i8(<32 x i8>* %a) #0 {
+define void @sdiv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@@ -81,91 +69,35 @@ define void @sdiv_v64i8(<64 x i8>* %a) #0 {
ret void
}
-define void @sdiv_v128i8(<128 x i8>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v128i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #96
-; VBITS_GE_256-NEXT: mov w9, #32
-; VBITS_GE_256-NEXT: mov w10, #64
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5
-; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5
-; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5
-; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @sdiv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
-define void @sdiv_v256i8(<256 x i8>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v256i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #192
-; VBITS_GE_256-NEXT: mov w9, #96
-; VBITS_GE_256-NEXT: mov w10, #32
-; VBITS_GE_256-NEXT: mov w11, #160
-; VBITS_GE_256-NEXT: mov w12, #64
-; VBITS_GE_256-NEXT: mov w13, #224
-; VBITS_GE_256-NEXT: mov w14, #128
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x11]
-; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x12]
-; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x13]
-; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x14]
-; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5
-; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5
-; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5
-; VBITS_GE_256-NEXT: asrd z5.b, p0/m, z5.b, #5
-; VBITS_GE_256-NEXT: asrd z4.b, p0/m, z4.b, #5
-; VBITS_GE_256-NEXT: asrd z6.b, p0/m, z6.b, #5
-; VBITS_GE_256-NEXT: asrd z7.b, p0/m, z7.b, #5
-; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x13]
-; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x14]
-; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x11]
-; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x12]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @sdiv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer)
store <256 x i8> %res, <256 x i8>* %a
ret void
}
-define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
+define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -177,7 +109,7 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
ret <4 x i16> %res
}
-define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
+define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -189,7 +121,7 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
ret <8 x i16> %res
}
-define void @sdiv_v16i16(<16 x i16>* %a) #0 {
+define void @sdiv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -229,91 +161,35 @@ define void @sdiv_v32i16(<32 x i16>* %a) #0 {
ret void
}
-define void @sdiv_v64i16(<64 x i16>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v64i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5
-; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5
-; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @sdiv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
-define void @sdiv_v128i16(<128 x i16>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v128i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #96
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #80
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #112
-; VBITS_GE_256-NEXT: mov x14, #64
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5
-; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5
-; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5
-; VBITS_GE_256-NEXT: asrd z5.h, p0/m, z5.h, #5
-; VBITS_GE_256-NEXT: asrd z4.h, p0/m, z4.h, #5
-; VBITS_GE_256-NEXT: asrd z6.h, p0/m, z6.h, #5
-; VBITS_GE_256-NEXT: asrd z7.h, p0/m, z7.h, #5
-; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @sdiv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer)
store <128 x i16> %res, <128 x i16>* %a
ret void
}
-define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
+define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -325,7 +201,7 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
ret <2 x i32> %res
}
-define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
+define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -337,7 +213,7 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
ret <4 x i32> %res
}
-define void @sdiv_v8i32(<8 x i32>* %a) #0 {
+define void @sdiv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -377,91 +253,35 @@ define void @sdiv_v16i32(<16 x i32>* %a) #0 {
ret void
}
-define void @sdiv_v32i32(<32 x i32>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v32i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5
-; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5
-; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @sdiv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
-define void @sdiv_v64i32(<64 x i32>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v64i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #56
-; VBITS_GE_256-NEXT: mov x14, #32
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5
-; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5
-; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5
-; VBITS_GE_256-NEXT: asrd z5.s, p0/m, z5.s, #5
-; VBITS_GE_256-NEXT: asrd z4.s, p0/m, z4.s, #5
-; VBITS_GE_256-NEXT: asrd z6.s, p0/m, z6.s, #5
-; VBITS_GE_256-NEXT: asrd z7.s, p0/m, z7.s, #5
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @sdiv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer)
store <64 x i32> %res, <64 x i32>* %a
ret void
}
-define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
+define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -474,7 +294,7 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
}
; Vector i64 sdiv are not legal for NEON so use SVE when available.
-define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
+define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -486,7 +306,7 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
ret <2 x i64> %res
}
-define void @sdiv_v4i64(<4 x i64>* %a) #0 {
+define void @sdiv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -526,84 +346,28 @@ define void @sdiv_v8i64(<8 x i64>* %a) #0 {
ret void
}
-define void @sdiv_v16i64(<16 x i64>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5
-; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5
-; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @sdiv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
-define void @sdiv_v32i64(<32 x i64>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: mov x13, #28
-; VBITS_GE_256-NEXT: mov x14, #16
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5
-; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5
-; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5
-; VBITS_GE_256-NEXT: asrd z5.d, p0/m, z5.d, #5
-; VBITS_GE_256-NEXT: asrd z4.d, p0/m, z4.d, #5
-; VBITS_GE_256-NEXT: asrd z6.d, p0/m, z6.d, #5
-; VBITS_GE_256-NEXT: asrd z7.d, p0/m, z7.d, #5
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @sdiv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer)
store <32 x i64> %res, <32 x i64>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index 322ec0eb0110..33877e17c766 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
; bigger than NEON. However, having no support opens us up to a code generator
; hang when expanding BUILD_VECTOR. Here we just validate the problematic case
; successfully exits code generation.
-define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 {
+define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) vscale_range(2,2) #0 {
; CHECK-LABEL: hang_when_merging_stores_after_legalisation:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -37,8 +37,8 @@ define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32
ret void
}
-; Ensure we don't crash when trying to lower a shuffle via and extract
-define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 {
+; Ensure we don't crash when trying to lower a shuffle via an extract
+define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) vscale_range(2,2) #0 {
; CHECK-LABEL: crash_when_lowering_extract_shuffle:
; CHECK: // %bb.0:
; CHECK-NEXT: tbnz w1, #0, .LBB1_2
@@ -132,4 +132,4 @@ exit:
ret void
}
-attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
+attributes #0 = { "target-features"="+sve" }
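Should these tests need regenerating again, the usual workflow (a sketch; the build directory path is an assumption, while the script and test paths are as in this patch) is to rerun the update script against a freshly built llc and then rerun the test itself:

python3 llvm/utils/update_llc_test_checks.py --llc-binary=./build/bin/llc llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
./build/bin/llvm-lit -v llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll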
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index c3af3f250b2b..50d0941cc52c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
-define <8 x i8> @splat_v8i8(i8 %a) #0 {
+define <8 x i8> @splat_v8i8(i8 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8b, w0
@@ -35,7 +21,7 @@ define <8 x i8> @splat_v8i8(i8 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <16 x i8> @splat_v16i8(i8 %a) #0 {
+define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.16b, w0
@@ -45,7 +31,7 @@ define <16 x i8> @splat_v16i8(i8 %a) #0 {
ret <16 x i8> %splat
}
-define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 {
+define void @splat_v32i8(i8 %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@@ -74,68 +60,32 @@ define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.b, w0
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <64 x i8> undef, i8 %a, i64 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
store <64 x i8> %splat, <64 x i8>* %b
ret void
}
-define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v128i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #96
-; VBITS_GE_256-NEXT: mov w9, #64
-; VBITS_GE_256-NEXT: mov w10, #32
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov z0.b, w0
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: mov z0.b, w0
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v128i8(i8 %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: mov z0.b, w0
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <128 x i8> undef, i8 %a, i64 0
%splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer
store <128 x i8> %splat, <128 x i8>* %b
ret void
}
-define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v256i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #224
-; VBITS_GE_256-NEXT: mov w9, #192
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov z0.b, w0
-; VBITS_GE_256-NEXT: mov w10, #160
-; VBITS_GE_256-NEXT: mov w11, #128
-; VBITS_GE_256-NEXT: mov w12, #96
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: mov w8, #64
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT: mov w9, #32
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x11]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x12]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: mov z0.b, w0
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v256i8(i8 %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: mov z0.b, w0
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <256 x i8> undef, i8 %a, i64 0
%splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer
store <256 x i8> %splat, <256 x i8>* %b
@@ -143,7 +93,7 @@ define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <4 x i16> @splat_v4i16(i16 %a) #0 {
+define <4 x i16> @splat_v4i16(i16 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4h, w0
@@ -154,7 +104,7 @@ define <4 x i16> @splat_v4i16(i16 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x i16> @splat_v8i16(i16 %a) #0 {
+define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8h, w0
@@ -164,7 +114,7 @@ define <8 x i16> @splat_v8i16(i16 %a) #0 {
ret <8 x i16> %splat
}
-define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 {
+define void @splat_v16i16(i16 %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -193,68 +143,32 @@ define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.h, w0
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <32 x i16> undef, i16 %a, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
store <32 x i16> %splat, <32 x i16>* %b
ret void
}
-define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #32
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov z0.h, w0
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov z0.h, w0
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v64i16(i16 %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: mov z0.h, w0
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <64 x i16> undef, i16 %a, i64 0
%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
store <64 x i16> %splat, <64 x i16>* %b
ret void
}
-define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v128i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #112
-; VBITS_GE_256-NEXT: mov x9, #96
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov z0.h, w0
-; VBITS_GE_256-NEXT: mov x10, #80
-; VBITS_GE_256-NEXT: mov x11, #64
-; VBITS_GE_256-NEXT: mov x12, #48
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: mov z0.h, w0
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v128i16(i16 %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: mov z0.h, w0
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <128 x i16> undef, i16 %a, i64 0
%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
store <128 x i16> %splat, <128 x i16>* %b
@@ -262,7 +176,7 @@ define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x i32> @splat_v2i32(i32 %a) #0 {
+define <2 x i32> @splat_v2i32(i32 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2s, w0
@@ -273,7 +187,7 @@ define <2 x i32> @splat_v2i32(i32 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x i32> @splat_v4i32(i32 %a) #0 {
+define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4s, w0
@@ -283,7 +197,7 @@ define <4 x i32> @splat_v4i32(i32 %a) #0 {
ret <4 x i32> %splat
}
-define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 {
+define void @splat_v8i32(i32 %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -312,68 +226,32 @@ define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.s, w0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <16 x i32> undef, i32 %a, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
store <16 x i32> %splat, <16 x i32>* %b
ret void
}
-define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, w0
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov z0.s, w0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v32i32(i32 %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: mov z0.s, w0
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <32 x i32> undef, i32 %a, i64 0
%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
store <32 x i32> %splat, <32 x i32>* %b
ret void
}
-define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #56
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, w0
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: mov x11, #32
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov z0.s, w0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v64i32(i32 %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: mov z0.s, w0
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <64 x i32> undef, i32 %a, i64 0
%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
store <64 x i32> %splat, <64 x i32>* %b
@@ -381,7 +259,7 @@ define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x i64> @splat_v1i64(i64 %a) #0 {
+define <1 x i64> @splat_v1i64(i64 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
@@ -392,7 +270,7 @@ define <1 x i64> @splat_v1i64(i64 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x i64> @splat_v2i64(i64 %a) #0 {
+define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2d, x0
@@ -402,7 +280,7 @@ define <2 x i64> @splat_v2i64(i64 %a) #0 {
ret <2 x i64> %splat
}
-define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 {
+define void @splat_v4i64(i64 %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -431,68 +309,32 @@ define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.d, x0
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <8 x i64> undef, i64 %a, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
store <8 x i64> %splat, <8 x i64>* %b
ret void
}
-define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov z0.d, x0
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov z0.d, x0
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v16i64(i64 %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: mov z0.d, x0
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <16 x i64> undef, i64 %a, i64 0
%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
store <16 x i64> %splat, <16 x i64>* %b
ret void
}
-define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov z0.d, x0
-; VBITS_GE_256-NEXT: mov x10, #20
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: mov z0.d, x0
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v32i64(i64 %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: mov z0.d, x0
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%insert = insertelement <32 x i64> undef, i64 %a, i64 0
%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
store <32 x i64> %splat, <32 x i64>* %b
@@ -504,7 +346,7 @@ define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
-define <4 x half> @splat_v4f16(half %a) #0 {
+define <4 x half> @splat_v4f16(half %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
@@ -516,7 +358,7 @@ define <4 x half> @splat_v4f16(half %a) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <8 x half> @splat_v8f16(half %a) #0 {
+define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
@@ -527,7 +369,7 @@ define <8 x half> @splat_v8f16(half %a) #0 {
ret <8 x half> %splat
}
-define void @splat_v16f16(half %a, <16 x half>* %b) #0 {
+define void @splat_v16f16(half %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -559,72 +401,34 @@ define void @splat_v32f16(half %a, <32 x half>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.h, h0
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <32 x half> undef, half %a, i64 0
%splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
store <32 x half> %splat, <32 x half>* %b
ret void
}
-define void @splat_v64f16(half %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #32
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov z0.h, h0
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov z0.h, h0
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v64f16(half %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <64 x half> undef, half %a, i64 0
%splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer
store <64 x half> %splat, <64 x half>* %b
ret void
}
-define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #112
-; VBITS_GE_256-NEXT: mov x9, #96
-; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #80
-; VBITS_GE_256-NEXT: mov z0.h, h0
-; VBITS_GE_256-NEXT: mov x11, #64
-; VBITS_GE_256-NEXT: mov x12, #48
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: mov z0.h, h0
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v128f16(half %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <128 x half> undef, half %a, i64 0
%splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer
store <128 x half> %splat, <128 x half>* %b
@@ -632,7 +436,7 @@ define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
+define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
@@ -644,7 +448,7 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
+define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
@@ -655,7 +459,7 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
ret <4 x float> %splat
}
-define void @splat_v8f32(float %a, <8 x float>* %b) #0 {
+define void @splat_v8f32(float %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
@@ -687,72 +491,34 @@ define void @splat_v16f32(float %a, <16 x float>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.s, s0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <16 x float> undef, float %a, i64 0
%splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
store <16 x float> %splat, <16 x float>* %b
ret void
}
-define void @splat_v32f32(float %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, s0
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov z0.s, s0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v32f32(float %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <32 x float> undef, float %a, i64 0
%splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer
store <32 x float> %splat, <32 x float>* %b
ret void
}
-define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #56
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: mov z0.s, s0
-; VBITS_GE_256-NEXT: mov x11, #32
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov z0.s, s0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v64f32(float %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <64 x float> undef, float %a, i64 0
%splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer
store <64 x float> %splat, <64 x float>* %b
@@ -760,7 +526,7 @@ define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
-define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
+define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@@ -770,7 +536,7 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
-define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
+define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -781,7 +547,7 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
ret <2 x double> %splat
}
-define void @splat_v4f64(double %a, <4 x double>* %b) #0 {
+define void @splat_v4f64(double %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -813,72 +579,34 @@ define void @splat_v8f64(double %a, <8 x double>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.d, d0
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
%insert = insertelement <8 x double> undef, double %a, i64 0
%splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
store <8 x double> %splat, <8 x double>* %b
ret void
}
-define void @splat_v16f64(double %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov z0.d, d0
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: splat_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov z0.d, d0
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @splat_v16f64(double %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <16 x double> undef, double %a, i64 0
%splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer
store <16 x double> %splat, <16 x double>* %b
ret void
}
-define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x10, #20
-; VBITS_GE_256-NEXT: mov z0.d, d0
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: splat_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: mov z0.d, d0
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @splat_v32f64(double %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <32 x double> undef, double %a, i64 0
%splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer
store <32 x double> %splat, <32 x double>* %b
@@ -889,88 +617,52 @@ define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
; DUP (integer immediate)
;
-define void @splat_imm_v64i8(<64 x i8>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v64i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
-; VBITS_GE_256-NEXT: mov z0.b, #1 // =0x1
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v64i8:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: mov z0.b, #1 // =0x1
-; VBITS_GE_512-NEXT: ptrue p0.b, vl64
-; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v64i8(<64 x i8>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, #1 // =0x1
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <64 x i8> undef, i8 1, i64 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
store <64 x i8> %splat, <64 x i8>* %a
ret void
}
-define void @splat_imm_v32i16(<32 x i16>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v32i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov z0.h, #2 // =0x2
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v32i16:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: mov z0.h, #2 // =0x2
-; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v32i16(<32 x i16>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #2 // =0x2
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <32 x i16> undef, i16 2, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
store <32 x i16> %splat, <32 x i16>* %a
ret void
}
-define void @splat_imm_v16i32(<16 x i32>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov z0.s, #3 // =0x3
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v16i32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: mov z0.s, #3 // =0x3
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v16i32(<16 x i32>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, #3 // =0x3
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <16 x i32> undef, i32 3, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
store <16 x i32> %splat, <16 x i32>* %a
ret void
}
-define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: mov z0.d, #4 // =0x4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v8i64:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: mov z0.d, #4 // =0x4
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v8i64(<8 x i64>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, #4 // =0x4
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <8 x i64> undef, i64 4, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
store <8 x i64> %splat, <8 x i64>* %a
@@ -981,69 +673,43 @@ define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
; DUP (floating-point immediate)
;
-define void @splat_imm_v32f16(<32 x half>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v32f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: fmov z0.h, #5.00000000
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v32f16:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: fmov z0.h, #5.00000000
-; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v32f16(<32 x half>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v32f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z0.h, #5.00000000
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <32 x half> undef, half 5.0, i64 0
%splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
store <32 x half> %splat, <32 x half>* %a
ret void
}
-define void @splat_imm_v16f32(<16 x float>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: fmov z0.s, #6.00000000
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v16f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: fmov z0.s, #6.00000000
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v16f32(<16 x float>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z0.s, #6.00000000
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <16 x float> undef, float 6.0, i64 0
%splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
store <16 x float> %splat, <16 x float>* %a
ret void
}
-define void @splat_imm_v8f64(<8 x double>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v8f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: fmov z0.d, #7.00000000
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v8f64:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: fmov z0.d, #7.00000000
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
+define void @splat_imm_v8f64(<8 x double>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v8f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z0.d, #7.00000000
+; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
%insert = insertelement <8 x double> undef, double 7.0, i64 0
%splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
store <8 x double> %splat, <8 x double>* %a
ret void
}
+
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
index 225edeb5b2d9..0b4c08d79853 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
@@ -1,35 +1,29 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
; Don't use SVE for 64-bit vectors.
define void @store_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: store_v2f32:
-; CHECK: str xzr, [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: str xzr, [x0]
+; CHECK-NEXT: ret
store <2 x float> zeroinitializer, <2 x float>* %a
ret void
}
@@ -37,66 +31,148 @@ define void @store_v2f32(<2 x float>* %a) #0 {
; Don't use SVE for 128-bit vectors.
define void @store_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: store_v4f32:
-; CHECK: stp xzr, xzr, [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp xzr, xzr, [x0]
+; CHECK-NEXT: ret
store <4 x float> zeroinitializer, <4 x float>* %a
ret void
}
define void @store_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: store_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
store <8 x float> zeroinitializer, <8 x float>* %a
ret void
}
define void @store_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: store_v16f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: store_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_v16f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+;
+; VBITS_GE_1024-LABEL: store_v16f32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT: ret
+;
+; VBITS_GE_2048-LABEL: store_v16f32:
+; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
+; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT: ret
store <16 x float> zeroinitializer, <16 x float>* %a
ret void
}
define void @store_v32f32(<32 x float>* %a) #0 {
-; CHECK-LABEL: store_v32f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: store_v32f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #24
+; VBITS_GE_256-NEXT: mov x9, #16
+; VBITS_GE_256-NEXT: mov x10, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_v32f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: mov x8, #16
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT: ret
+;
+; VBITS_GE_1024-LABEL: store_v32f32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT: ret
+;
+; VBITS_GE_2048-LABEL: store_v32f32:
+; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT: ret
store <32 x float> zeroinitializer, <32 x float>* %a
ret void
}
define void @store_v64f32(<64 x float>* %a) #0 {
-; CHECK-LABEL: store_v64f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
-; VBITS_LE_512-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
-; VBITS_LE_512-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A4]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A5]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A6]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A7]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: store_v64f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #56
+; VBITS_GE_256-NEXT: mov x9, #48
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT: mov x10, #40
+; VBITS_GE_256-NEXT: mov x11, #32
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: mov x8, #24
+; VBITS_GE_256-NEXT: mov x12, #16
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: mov x9, #8
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_v64f32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: mov x8, #48
+; VBITS_GE_512-NEXT: mov x9, #32
+; VBITS_GE_512-NEXT: mov x10, #16
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+;
+; VBITS_GE_1024-LABEL: store_v64f32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: mov x8, #32
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_1024-NEXT: ret
+;
+; VBITS_GE_2048-LABEL: store_v64f32:
+; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
+; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT: ret
store <64 x float> zeroinitializer, <64 x float>* %a
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
index 99a414f6b66c..2ce5a2d27989 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; Test we can code generate patterns of the form:
; fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
@@ -28,7 +14,7 @@
target triple = "aarch64-unknown-linux-gnu"
-define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 {
+define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -42,7 +28,7 @@ bb1:
ret void
}
-define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 {
+define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -82,29 +68,13 @@ bb1:
ret void
}
-define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #32
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: subvector_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
br label %bb1
@@ -113,7 +83,7 @@ bb1:
ret void
}
-define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 {
+define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -153,29 +123,13 @@ bb1:
ret void
}
-define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: subvector_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
br label %bb1
@@ -184,41 +138,13 @@ bb1:
ret void
}
-define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #56
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: mov x11, #32
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: mov x13, #16
-; VBITS_GE_256-NEXT: mov x14, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: subvector_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
br label %bb1
@@ -228,23 +154,16 @@ bb1:
}
-define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: subvector_v8i64:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_512-NEXT: ret
+define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) vscale_range(2,0) #0 {
+; CHECK-LABEL: subvector_v8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #4
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
br label %bb1
@@ -253,29 +172,13 @@ bb1:
ret void
}
-define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: subvector_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
br label %bb1
@@ -284,41 +187,13 @@ bb1:
ret void
}
-define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #20
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: mov x13, #8
-; VBITS_GE_256-NEXT: mov x14, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: subvector_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
br label %bb1
@@ -327,7 +202,7 @@ bb1:
ret void
}
-define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) #0 {
+define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -341,7 +216,7 @@ bb1:
ret void
}
-define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 {
+define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -381,29 +256,13 @@ bb1:
ret void
}
-define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #48
-; VBITS_GE_256-NEXT: mov x9, #32
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: subvector_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x half>, <64 x half>* %in
br label %bb1
@@ -412,7 +271,7 @@ bb1:
ret void
}
-define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 {
+define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -452,29 +311,13 @@ bb1:
ret void
}
-define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #24
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: subvector_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x float>, <32 x float>* %in
br label %bb1
@@ -483,41 +326,13 @@ bb1:
ret void
}
-define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #56
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #40
-; VBITS_GE_256-NEXT: mov x11, #32
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: mov x13, #16
-; VBITS_GE_256-NEXT: mov x14, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: subvector_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x float>, <64 x float>* %in
br label %bb1
@@ -550,29 +365,13 @@ bb1:
ret void
}
-define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #12
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: subvector_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x double>, <16 x double>* %in
br label %bb1
@@ -581,41 +380,13 @@ bb1:
ret void
}
-define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #20
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: mov x13, #8
-; VBITS_GE_256-NEXT: mov x14, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: subvector_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x double>, <32 x double>* %in
br label %bb1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
index a32d5ce78f17..9b56968bbcb3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
@@ -1,43 +1,30 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
-define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v2i64i8
-; CHECK: ldr q[[Q0:[0-9]+]], [x0]
-; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
-; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
-; CHECK-NEXT: ret
+define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) vscale_range(2,0) #0 {
+; CHECK-LABEL: store_trunc_v2i64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <2 x i64>, <2 x i64>* %ap
%val = trunc <2 x i64> %a to <2 x i8>
store <2 x i8> %val, <2 x i8>* %dest
ret void
}
-define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v4i64i8
-; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
-; CHECK-NEXT: ret
+define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) vscale_range(2,0) #0 {
+; CHECK-LABEL: store_trunc_v4i64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %ap
%val = trunc <4 x i64> %a to <4 x i8>
store <4 x i8> %val, <4 x i8>* %dest
@@ -45,48 +32,52 @@ define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
}
define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG2]], [[WORDS_LO]].s, [[WORDS_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1b { [[WORDS]].s }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%val = trunc <8 x i64> %a to <8 x i8>
store <8 x i8> %val, <8 x i8>* %dest
ret void
}
-define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
+define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) vscale_range(8,0) #0 {
; CHECK-LABEL: store_trunc_v16i64i8:
-; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %ap
%val = trunc <16 x i64> %a to <16 x i8>
store <16 x i8> %val, <16 x i8>* %dest
ret void
}
-define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
+define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) vscale_range(16,0) #0 {
; CHECK-LABEL: store_trunc_v32i64i8:
-; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %ap
%val = trunc <32 x i64> %a to <32 x i8>
store <32 x i8> %val, <32 x i8>* %dest
@@ -94,25 +85,27 @@ define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
}
define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i16:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
; Currently does not use the truncating store
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z[[HALFS_LO:[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[HALFS_HI:[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-NEXT: mov v[[HALFS_LO]].d[1], v[[HALFS_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[HALFS_LO]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: str q1, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%val = trunc <8 x i64> %a to <8 x i16>
store <8 x i16> %val, <8 x i16>* %dest
@@ -120,24 +113,26 @@ define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
}
define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i32:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG1]], [[WORDS_LO]].s, [[WORDS_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[WORDS]].s }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%val = trunc <8 x i64> %a to <8 x i32>
store <8 x i32> %val, <8 x i32>* %dest
@@ -145,25 +140,27 @@ define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
}
define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i32i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
; Currently does not use the truncating store
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-DAG: uzp1 z[[BYTES_LO:[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
-; VBITS_EQ_256-DAG: uzp1 z[[BYTES_HI:[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-NEXT: mov v[[BYTES_LO]].d[1], v[[BYTES_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[BYTES_LO]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: str q1, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %ap
%val = trunc <16 x i32> %a to <16 x i8>
store <16 x i8> %val, <16 x i8>* %dest
@@ -171,24 +168,26 @@ define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
}
define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i32i16:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-DAG: splice [[HALFS:z[0-9]+]].h, [[PG2]], [[HALFS_LO]].h, [[HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[HALFS]].h }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %ap
%val = trunc <16 x i32> %a to <16 x i16>
store <16 x i16> %val, <16 x i16>* %dest
@@ -196,24 +195,26 @@ define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
}
define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v32i16i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[HALFS_LO:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HALFS_HI:z[0-9]+]].h }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].b, vl16
-; VBITS_EQ_256-DAG: uzp1 [[BYTES_LO:z[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
-; VBITS_EQ_256-DAG: uzp1 [[BYTES_HI:z[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-DAG: splice [[BYTES:z[0-9]+]].b, [[PG2]], [[BYTES_LO]].b, [[BYTES_HI]].b
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: st1b { [[BYTES]].b }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl16
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = trunc <32 x i16> %a to <32 x i8>
store <32 x i8> %val, <32 x i8>* %dest
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
index 275716e06c23..8850bd4e84cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
@@ -1,35 +1,22 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
;
; truncate i16 -> i8
;
-define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
+define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v16i16_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %in
%b = trunc <16 x i16> %a to <16 x i8>
ret <16 x i8> %b
@@ -37,11 +24,30 @@ define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
-; CHECK-LABEL: trunc_v32i16_v32i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl16
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: add z0.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %in
%b = trunc <32 x i16> %a to <32 x i8>
%c = add <32 x i8> %b, %b
@@ -50,12 +56,16 @@ define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
+define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v64i16_v64i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
%b = trunc <64 x i16> %a to <64 x i8>
%c = add <64 x i8> %b, %b
@@ -64,12 +74,16 @@ define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
+define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v128i16_v128i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <128 x i16>, <128 x i16>* %in
%b = trunc <128 x i16> %a to <128 x i8>
%c = add <128 x i8> %b, %b
@@ -81,38 +95,60 @@ define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
; truncate i32 -> i8
;
-define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) #0 {
+define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %in
%b = trunc <8 x i32> %a to <8 x i8>
ret <8 x i8> %b
}
define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %in) #0 {
-; CHECK-LABEL: trunc_v16i32_v16i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z2.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %in
%b = trunc <16 x i32> %a to <16 x i8>
ret <16 x i8> %b
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
+define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
%b = trunc <32 x i32> %a to <32 x i8>
%c = add <32 x i8> %b, %b
@@ -121,13 +157,17 @@ define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
+define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
%b = trunc <64 x i32> %a to <64 x i8>
%c = add <64 x i8> %b, %b
@@ -139,12 +179,14 @@ define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
; truncate i32 -> i16
;
-define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
+define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %in
%b = trunc <8 x i32> %a to <8 x i16>
ret <8 x i16> %b
@@ -152,11 +194,30 @@ define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
-; CHECK-LABEL: trunc_v16i32_v16i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: add z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %in
%b = trunc <16 x i32> %a to <16 x i16>
%c = add <16 x i16> %b, %b
@@ -165,12 +226,16 @@ define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
+define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
%b = trunc <32 x i32> %a to <32 x i16>
%c = add <32 x i16> %b, %b
@@ -179,12 +244,16 @@ define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
+define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
%b = trunc <64 x i32> %a to <64 x i16>
%c = add <64 x i16> %b, %b
@@ -197,53 +266,78 @@ define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
;
; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
-define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) #0 {
+define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i8:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %in
%b = trunc <4 x i64> %a to <4 x i8>
ret <4 x i8> %b
}
define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %in) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
%b = trunc <8 x i64> %a to <8 x i8>
ret <8 x i8> %b
}
-define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) #0 {
+define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
%b = trunc <16 x i64> %a to <16 x i8>
ret <16 x i8> %b
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
+define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
%b = trunc <32 x i64> %a to <32 x i8>
%c = add <32 x i8> %b, %b
@@ -255,38 +349,60 @@ define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
; truncate i64 -> i16
;
-define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) #0 {
+define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %in
%b = trunc <4 x i64> %a to <4 x i16>
ret <4 x i16> %b
}
define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %in) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
%b = trunc <8 x i64> %a to <8 x i16>
ret <8 x i16> %b
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
+define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
%b = trunc <16 x i64> %a to <16 x i16>
%c = add <16 x i16> %b, %b
@@ -295,13 +411,17 @@ define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
+define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
%b = trunc <32 x i64> %a to <32 x i16>
%c = add <32 x i16> %b, %b
@@ -313,12 +433,14 @@ define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
; truncate i64 -> i32
;
-define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
+define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %in
%b = trunc <4 x i64> %a to <4 x i32>
ret <4 x i32> %b
@@ -326,11 +448,30 @@ define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: add z0.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
%b = trunc <8 x i64> %a to <8 x i32>
%c = add <8 x i32> %b, %b
@@ -339,12 +480,16 @@ define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
+define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
%b = trunc <16 x i64> %a to <16 x i32>
%c = add <16 x i32> %b, %b
@@ -353,12 +498,16 @@ define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) #0 {
+define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
%b = trunc <32 x i64> %a to <32 x i32>
%c = add <32 x i32> %b, %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
index 61156422b46b..fff1e502becb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -1,26 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors
-define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #7
@@ -30,7 +16,7 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
}
; Don't use SVE for 128-bit vectors
-define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
+define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #15
@@ -40,7 +26,7 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
ret <16 x i8> %ret
}
-define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@@ -62,7 +48,6 @@ define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32
@@ -104,47 +89,18 @@ define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #64
-; VBITS_GE_256-NEXT: mov w10, #32
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov w9, #96
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x10]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x9]
-; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.b, z0.b[31]
-; VBITS_GE_256-NEXT: fmov w11, s5
-; VBITS_GE_256-NEXT: mov z5.b, z2.b[31]
-; VBITS_GE_256-NEXT: mov z1.b, z1.b[31]
-; VBITS_GE_256-NEXT: fmov w12, s5
-; VBITS_GE_256-NEXT: mov z5.b, z4.b[31]
-; VBITS_GE_256-NEXT: fmov w13, s1
-; VBITS_GE_256-NEXT: fmov w14, s5
-; VBITS_GE_256-NEXT: insr z3.b, w11
-; VBITS_GE_256-NEXT: insr z0.b, w12
-; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT: insr z4.b, w13
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: insr z2.b, w14
-; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0]
-; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: mov w8, #127
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.b, xzr, x8
-; VBITS_GE_1024-NEXT: lastb w8, p1, z0.b
-; VBITS_GE_1024-NEXT: insr z1.b, w8
-; VBITS_GE_1024-NEXT: st1b { z1.b }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: mov w8, #127
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.b, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.b
+; CHECK-NEXT: insr z1.b, w8
+; CHECK-NEXT: st1b { z1.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%ret = shufflevector <128 x i8> %op1, <128 x i8> %op2, <128 x i32> <i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134,
@@ -167,71 +123,18 @@ define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v256i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov w11, #128
-; VBITS_GE_256-NEXT: mov w13, #64
-; VBITS_GE_256-NEXT: mov w12, #96
-; VBITS_GE_256-NEXT: mov w14, #160
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8]
-; VBITS_GE_256-NEXT: mov w10, #192
-; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x11]
-; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x13]
-; VBITS_GE_256-NEXT: mov w9, #224
-; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x1, x12]
-; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x10]
-; VBITS_GE_256-NEXT: mov z6.b, z0.b[31]
-; VBITS_GE_256-NEXT: fmov w15, s6
-; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x1, x14]
-; VBITS_GE_256-NEXT: mov z16.b, z3.b[31]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9]
-; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fmov w16, s16
-; VBITS_GE_256-NEXT: mov z16.b, z5.b[31]
-; VBITS_GE_256-NEXT: insr z5.b, w15
-; VBITS_GE_256-NEXT: fmov w15, s16
-; VBITS_GE_256-NEXT: mov z16.b, z7.b[31]
-; VBITS_GE_256-NEXT: mov z1.b, z1.b[31]
-; VBITS_GE_256-NEXT: fmov w17, s16
-; VBITS_GE_256-NEXT: mov z16.b, z6.b[31]
-; VBITS_GE_256-NEXT: fmov w18, s16
-; VBITS_GE_256-NEXT: mov z16.b, z4.b[31]
-; VBITS_GE_256-NEXT: insr z7.b, w15
-; VBITS_GE_256-NEXT: fmov w15, s16
-; VBITS_GE_256-NEXT: mov z16.b, z17.b[31]
-; VBITS_GE_256-NEXT: fmov w1, s1
-; VBITS_GE_256-NEXT: fmov w2, s16
-; VBITS_GE_256-NEXT: insr z3.b, w17
-; VBITS_GE_256-NEXT: insr z6.b, w16
-; VBITS_GE_256-NEXT: insr z4.b, w18
-; VBITS_GE_256-NEXT: insr z2.b, w15
-; VBITS_GE_256-NEXT: insr z17.b, w1
-; VBITS_GE_256-NEXT: insr z0.b, w2
-; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x11]
-; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0, x12]
-; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x13]
-; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x14]
-; VBITS_GE_256-NEXT: st1b { z17.b }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: mov w8, #255
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.b, xzr, x8
-; VBITS_GE_2048-NEXT: lastb w8, p1, z0.b
-; VBITS_GE_2048-NEXT: insr z1.b, w8
-; VBITS_GE_2048-NEXT: st1b { z1.b }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.b, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.b
+; CHECK-NEXT: insr z1.b, w8
+; CHECK-NEXT: st1b { z1.b }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%ret = shufflevector <256 x i8> %op1, <256 x i8> %op2, <256 x i32> <i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262,
@@ -271,7 +174,7 @@ define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors
-define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #6
@@ -281,7 +184,7 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
}
; Don't use SVE for 128-bit vectors
-define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #14
@@ -290,7 +193,7 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
ret <8 x i16> %ret
}
-define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -310,7 +213,6 @@ define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
@@ -348,47 +250,18 @@ define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.h, z0.h[15]
-; VBITS_GE_256-NEXT: fmov w11, s5
-; VBITS_GE_256-NEXT: mov z5.h, z2.h[15]
-; VBITS_GE_256-NEXT: mov z1.h, z1.h[15]
-; VBITS_GE_256-NEXT: fmov w12, s5
-; VBITS_GE_256-NEXT: mov z5.h, z4.h[15]
-; VBITS_GE_256-NEXT: fmov w13, s1
-; VBITS_GE_256-NEXT: fmov w14, s5
-; VBITS_GE_256-NEXT: insr z3.h, w11
-; VBITS_GE_256-NEXT: insr z0.h, w12
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: insr z4.h, w13
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: insr z2.h, w14
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov w8, #63
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.h, xzr, x8
-; VBITS_GE_1024-NEXT: lastb w8, p1, z0.h
-; VBITS_GE_1024-NEXT: insr z1.h, w8
-; VBITS_GE_1024-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.h, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.h
+; CHECK-NEXT: insr z1.h, w8
+; CHECK-NEXT: st1h { z1.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%ret = shufflevector <64 x i16> %op1, <64 x i16> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -403,71 +276,18 @@ define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #64
-; VBITS_GE_256-NEXT: mov x13, #32
-; VBITS_GE_256-NEXT: mov x14, #48
-; VBITS_GE_256-NEXT: mov x11, #80
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: mov x12, #96
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: mov x9, #112
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: mov z6.h, z0.h[15]
-; VBITS_GE_256-NEXT: fmov w15, s6
-; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: mov z16.h, z2.h[15]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fmov w16, s16
-; VBITS_GE_256-NEXT: mov z16.h, z5.h[15]
-; VBITS_GE_256-NEXT: insr z5.h, w15
-; VBITS_GE_256-NEXT: fmov w15, s16
-; VBITS_GE_256-NEXT: mov z16.h, z7.h[15]
-; VBITS_GE_256-NEXT: mov z1.h, z1.h[15]
-; VBITS_GE_256-NEXT: fmov w17, s16
-; VBITS_GE_256-NEXT: mov z16.h, z6.h[15]
-; VBITS_GE_256-NEXT: fmov w18, s16
-; VBITS_GE_256-NEXT: mov z16.h, z4.h[15]
-; VBITS_GE_256-NEXT: insr z7.h, w15
-; VBITS_GE_256-NEXT: fmov w15, s16
-; VBITS_GE_256-NEXT: mov z16.h, z17.h[15]
-; VBITS_GE_256-NEXT: fmov w1, s1
-; VBITS_GE_256-NEXT: fmov w2, s16
-; VBITS_GE_256-NEXT: insr z2.h, w17
-; VBITS_GE_256-NEXT: insr z6.h, w16
-; VBITS_GE_256-NEXT: insr z4.h, w18
-; VBITS_GE_256-NEXT: insr z3.h, w15
-; VBITS_GE_256-NEXT: insr z17.h, w1
-; VBITS_GE_256-NEXT: insr z0.h, w2
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z17.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: mov w8, #127
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.h, xzr, x8
-; VBITS_GE_2048-NEXT: lastb w8, p1, z0.h
-; VBITS_GE_2048-NEXT: insr z1.h, w8
-; VBITS_GE_2048-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v128i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: mov w8, #127
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.h, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.h
+; CHECK-NEXT: insr z1.h, w8
+; CHECK-NEXT: st1h { z1.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%ret = shufflevector <128 x i16> %op1, <128 x i16> %op2, <128 x i32> <i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134,
@@ -491,7 +311,7 @@ define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors
-define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
@@ -501,7 +321,7 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
}
; Don't use SVE for 128-bit vectors
-define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12
@@ -510,7 +330,7 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
ret <4 x i32> %ret
}
-define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -529,7 +349,6 @@ define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -565,47 +384,18 @@ define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.s, z0.s[7]
-; VBITS_GE_256-NEXT: fmov w11, s5
-; VBITS_GE_256-NEXT: mov z5.s, z2.s[7]
-; VBITS_GE_256-NEXT: mov z1.s, z1.s[7]
-; VBITS_GE_256-NEXT: fmov w12, s5
-; VBITS_GE_256-NEXT: mov z5.s, z4.s[7]
-; VBITS_GE_256-NEXT: fmov w13, s1
-; VBITS_GE_256-NEXT: fmov w14, s5
-; VBITS_GE_256-NEXT: insr z3.s, w11
-; VBITS_GE_256-NEXT: insr z0.s, w12
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: insr z4.s, w13
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: insr z2.s, w14
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov w8, #31
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.s, xzr, x8
-; VBITS_GE_1024-NEXT: lastb w8, p1, z0.s
-; VBITS_GE_1024-NEXT: insr z1.s, w8
-; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.s, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.s
+; CHECK-NEXT: insr z1.s, w8
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%ret = shufflevector <32 x i32> %op1, <32 x i32> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -616,71 +406,18 @@ define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x13, #16
-; VBITS_GE_256-NEXT: mov x14, #24
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x12, #48
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #56
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: mov z6.s, z0.s[7]
-; VBITS_GE_256-NEXT: fmov w15, s6
-; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: mov z16.s, z2.s[7]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fmov w16, s16
-; VBITS_GE_256-NEXT: mov z16.s, z5.s[7]
-; VBITS_GE_256-NEXT: insr z5.s, w15
-; VBITS_GE_256-NEXT: fmov w15, s16
-; VBITS_GE_256-NEXT: mov z16.s, z7.s[7]
-; VBITS_GE_256-NEXT: mov z1.s, z1.s[7]
-; VBITS_GE_256-NEXT: fmov w17, s16
-; VBITS_GE_256-NEXT: mov z16.s, z6.s[7]
-; VBITS_GE_256-NEXT: fmov w18, s16
-; VBITS_GE_256-NEXT: mov z16.s, z4.s[7]
-; VBITS_GE_256-NEXT: insr z7.s, w15
-; VBITS_GE_256-NEXT: fmov w15, s16
-; VBITS_GE_256-NEXT: mov z16.s, z17.s[7]
-; VBITS_GE_256-NEXT: fmov w1, s1
-; VBITS_GE_256-NEXT: fmov w2, s16
-; VBITS_GE_256-NEXT: insr z2.s, w17
-; VBITS_GE_256-NEXT: insr z6.s, w16
-; VBITS_GE_256-NEXT: insr z4.s, w18
-; VBITS_GE_256-NEXT: insr z3.s, w15
-; VBITS_GE_256-NEXT: insr z17.s, w1
-; VBITS_GE_256-NEXT: insr z0.s, w2
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z17.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov w8, #63
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.s, xzr, x8
-; VBITS_GE_2048-NEXT: lastb w8, p1, z0.s
-; VBITS_GE_2048-NEXT: insr z1.s, w8
-; VBITS_GE_2048-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.s, xzr, x8
+; CHECK-NEXT: lastb w8, p1, z0.s
+; CHECK-NEXT: insr z1.s, w8
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%ret = shufflevector <64 x i32> %op1, <64 x i32> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -696,7 +433,7 @@ define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 128-bit vectors
-define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
@@ -705,7 +442,7 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
ret <2 x i64> %ret
}
-define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -724,7 +461,6 @@ define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -759,47 +495,18 @@ define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.d, z0.d[3]
-; VBITS_GE_256-NEXT: fmov x11, d5
-; VBITS_GE_256-NEXT: mov z5.d, z2.d[3]
-; VBITS_GE_256-NEXT: mov z1.d, z1.d[3]
-; VBITS_GE_256-NEXT: fmov x12, d5
-; VBITS_GE_256-NEXT: mov z5.d, z4.d[3]
-; VBITS_GE_256-NEXT: fmov x13, d1
-; VBITS_GE_256-NEXT: fmov x14, d5
-; VBITS_GE_256-NEXT: insr z3.d, x11
-; VBITS_GE_256-NEXT: insr z0.d, x12
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: insr z4.d, x13
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: insr z2.d, x14
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16i64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov w8, #15
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.d, xzr, x8
-; VBITS_GE_1024-NEXT: lastb x8, p1, z0.d
-; VBITS_GE_1024-NEXT: insr z1.d, x8
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: mov w8, #15
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.d, xzr, x8
+; CHECK-NEXT: lastb x8, p1, z0.d
+; CHECK-NEXT: insr z1.d, x8
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%ret = shufflevector <16 x i64> %op1, <16 x i64> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -808,71 +515,18 @@ define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x13, #8
-; VBITS_GE_256-NEXT: mov x14, #12
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: mov z6.d, z0.d[3]
-; VBITS_GE_256-NEXT: fmov x15, d6
-; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: mov z16.d, z2.d[3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: fmov x16, d16
-; VBITS_GE_256-NEXT: mov z16.d, z5.d[3]
-; VBITS_GE_256-NEXT: insr z5.d, x15
-; VBITS_GE_256-NEXT: fmov x15, d16
-; VBITS_GE_256-NEXT: mov z16.d, z7.d[3]
-; VBITS_GE_256-NEXT: mov z1.d, z1.d[3]
-; VBITS_GE_256-NEXT: fmov x17, d16
-; VBITS_GE_256-NEXT: mov z16.d, z6.d[3]
-; VBITS_GE_256-NEXT: fmov x18, d16
-; VBITS_GE_256-NEXT: mov z16.d, z4.d[3]
-; VBITS_GE_256-NEXT: insr z7.d, x15
-; VBITS_GE_256-NEXT: fmov x15, d16
-; VBITS_GE_256-NEXT: mov z16.d, z17.d[3]
-; VBITS_GE_256-NEXT: fmov x1, d1
-; VBITS_GE_256-NEXT: fmov x2, d16
-; VBITS_GE_256-NEXT: insr z2.d, x17
-; VBITS_GE_256-NEXT: insr z6.d, x16
-; VBITS_GE_256-NEXT: insr z4.d, x18
-; VBITS_GE_256-NEXT: insr z3.d, x15
-; VBITS_GE_256-NEXT: insr z17.d, x1
-; VBITS_GE_256-NEXT: insr z0.d, x2
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z17.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32i64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: mov w8, #31
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.d, xzr, x8
-; VBITS_GE_2048-NEXT: lastb x8, p1, z0.d
-; VBITS_GE_2048-NEXT: insr z1.d, x8
-; VBITS_GE_2048-NEXT: st1d { z1.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.d, xzr, x8
+; CHECK-NEXT: lastb x8, p1, z0.d
+; CHECK-NEXT: insr z1.d, x8
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%ret = shufflevector <32 x i64> %op1, <32 x i64> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -884,7 +538,7 @@ define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
}
; Don't use SVE for 64-bit vectors
-define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #6
@@ -894,7 +548,7 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) #0
}
; Don't use SVE for 128-bit vectors
-define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #14
@@ -903,7 +557,7 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) #0
ret <8 x half> %ret
}
-define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -922,7 +576,6 @@ define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
}
define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
@@ -957,43 +610,18 @@ define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.h, z0.h[15]
-; VBITS_GE_256-NEXT: insr z1.h, h5
-; VBITS_GE_256-NEXT: mov z5.h, z3.h[15]
-; VBITS_GE_256-NEXT: mov z2.h, z2.h[15]
-; VBITS_GE_256-NEXT: insr z0.h, h5
-; VBITS_GE_256-NEXT: mov z5.h, z4.h[15]
-; VBITS_GE_256-NEXT: insr z4.h, h2
-; VBITS_GE_256-NEXT: insr z3.h, h5
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64f16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov w8, #63
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.h, xzr, x8
-; VBITS_GE_1024-NEXT: lastb h0, p1, z0.h
-; VBITS_GE_1024-NEXT: insr z1.h, h0
-; VBITS_GE_1024-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.h, xzr, x8
+; CHECK-NEXT: lastb h0, p1, z0.h
+; CHECK-NEXT: insr z1.h, h0
+; CHECK-NEXT: st1h { z1.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%ret = shufflevector <64 x half> %op1, <64 x half> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -1008,63 +636,18 @@ define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v128f16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x10, #64
-; VBITS_GE_256-NEXT: mov x9, #80
-; VBITS_GE_256-NEXT: mov x11, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x12, #32
-; VBITS_GE_256-NEXT: mov x13, #48
-; VBITS_GE_256-NEXT: mov x8, #112
-; VBITS_GE_256-NEXT: mov x14, #96
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z18.h, z3.h[15]
-; VBITS_GE_256-NEXT: mov z6.h, z1.h[15]
-; VBITS_GE_256-NEXT: insr z1.h, h18
-; VBITS_GE_256-NEXT: mov z18.h, z5.h[15]
-; VBITS_GE_256-NEXT: mov z19.h, z4.h[15]
-; VBITS_GE_256-NEXT: insr z4.h, h18
-; VBITS_GE_256-NEXT: mov z18.h, z16.h[15]
-; VBITS_GE_256-NEXT: insr z3.h, h18
-; VBITS_GE_256-NEXT: mov z18.h, z7.h[15]
-; VBITS_GE_256-NEXT: insr z7.h, h6
-; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT: mov z6.h, z17.h[15]
-; VBITS_GE_256-NEXT: insr z16.h, h19
-; VBITS_GE_256-NEXT: insr z2.h, h18
-; VBITS_GE_256-NEXT: insr z17.h, h0
-; VBITS_GE_256-NEXT: insr z5.h, h6
-; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z16.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z17.h }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128f16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: mov w8, #127
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.h, xzr, x8
-; VBITS_GE_2048-NEXT: lastb h0, p1, z0.h
-; VBITS_GE_2048-NEXT: insr z1.h, h0
-; VBITS_GE_2048-NEXT: st1h { z1.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v128f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: mov w8, #127
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.h, xzr, x8
+; CHECK-NEXT: lastb h0, p1, z0.h
+; CHECK-NEXT: insr z1.h, h0
+; CHECK-NEXT: st1h { z1.h }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%ret = shufflevector <128 x half> %op1, <128 x half> %op2, <128 x i32> <i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134,
@@ -1088,7 +671,7 @@ define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors
-define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
@@ -1098,7 +681,7 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
}
; Don't use SVE for 128-bit vectors
-define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12
@@ -1107,7 +690,7 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
ret <4 x float> %ret
}
-define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -1125,7 +708,6 @@ define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
}
define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -1158,43 +740,18 @@ define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: mov x9, #24
-; VBITS_GE_256-NEXT: mov x10, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.s, z0.s[7]
-; VBITS_GE_256-NEXT: insr z1.s, s5
-; VBITS_GE_256-NEXT: mov z5.s, z3.s[7]
-; VBITS_GE_256-NEXT: mov z2.s, z2.s[7]
-; VBITS_GE_256-NEXT: insr z0.s, s5
-; VBITS_GE_256-NEXT: mov z5.s, z4.s[7]
-; VBITS_GE_256-NEXT: insr z4.s, s2
-; VBITS_GE_256-NEXT: insr z3.s, s5
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov w8, #31
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.s, xzr, x8
-; VBITS_GE_1024-NEXT: lastb s0, p1, z0.s
-; VBITS_GE_1024-NEXT: insr z1.s, s0
-; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.s, xzr, x8
+; CHECK-NEXT: lastb s0, p1, z0.s
+; CHECK-NEXT: insr z1.s, s0
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%ret = shufflevector <32 x float> %op1, <32 x float> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -1205,63 +762,18 @@ define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x9, #40
-; VBITS_GE_256-NEXT: mov x11, #8
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #16
-; VBITS_GE_256-NEXT: mov x13, #24
-; VBITS_GE_256-NEXT: mov x8, #56
-; VBITS_GE_256-NEXT: mov x14, #48
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z18.s, z3.s[7]
-; VBITS_GE_256-NEXT: mov z6.s, z1.s[7]
-; VBITS_GE_256-NEXT: insr z1.s, s18
-; VBITS_GE_256-NEXT: mov z18.s, z5.s[7]
-; VBITS_GE_256-NEXT: mov z19.s, z4.s[7]
-; VBITS_GE_256-NEXT: insr z4.s, s18
-; VBITS_GE_256-NEXT: mov z18.s, z16.s[7]
-; VBITS_GE_256-NEXT: insr z3.s, s18
-; VBITS_GE_256-NEXT: mov z18.s, z7.s[7]
-; VBITS_GE_256-NEXT: insr z7.s, s6
-; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT: mov z6.s, z17.s[7]
-; VBITS_GE_256-NEXT: insr z16.s, s19
-; VBITS_GE_256-NEXT: insr z2.s, s18
-; VBITS_GE_256-NEXT: insr z17.s, s0
-; VBITS_GE_256-NEXT: insr z5.s, s6
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z16.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z17.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov w8, #63
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.s, xzr, x8
-; VBITS_GE_2048-NEXT: lastb s0, p1, z0.s
-; VBITS_GE_2048-NEXT: insr z1.s, s0
-; VBITS_GE_2048-NEXT: st1w { z1.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.s, xzr, x8
+; CHECK-NEXT: lastb s0, p1, z0.s
+; CHECK-NEXT: insr z1.s, s0
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%ret = shufflevector <64 x float> %op1, <64 x float> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -1277,7 +789,7 @@ define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors
-define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
@@ -1286,7 +798,7 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
ret <2 x double> %ret
}
-define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1304,7 +816,6 @@ define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
}
define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -1336,43 +847,18 @@ define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z5.d, z0.d[3]
-; VBITS_GE_256-NEXT: insr z1.d, d5
-; VBITS_GE_256-NEXT: mov z5.d, z3.d[3]
-; VBITS_GE_256-NEXT: mov z2.d, z2.d[3]
-; VBITS_GE_256-NEXT: insr z0.d, d5
-; VBITS_GE_256-NEXT: mov z5.d, z4.d[3]
-; VBITS_GE_256-NEXT: insr z4.d, d2
-; VBITS_GE_256-NEXT: insr z3.d, d5
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16f64:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: mov w8, #15
-; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: whilels p1.d, xzr, x8
-; VBITS_GE_1024-NEXT: lastb d0, p1, z0.d
-; VBITS_GE_1024-NEXT: insr z1.d, d0
-; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
+define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v16f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: mov w8, #15
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.d, xzr, x8
+; CHECK-NEXT: lastb d0, p1, z0.d
+; CHECK-NEXT: insr z1.d, d0
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%ret = shufflevector <16 x double> %op1, <16 x double> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -1381,63 +867,18 @@ define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: mov x11, #4
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x12, #8
-; VBITS_GE_256-NEXT: mov x13, #12
-; VBITS_GE_256-NEXT: mov x8, #28
-; VBITS_GE_256-NEXT: mov x14, #24
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT: mov z18.d, z3.d[3]
-; VBITS_GE_256-NEXT: mov z6.d, z1.d[3]
-; VBITS_GE_256-NEXT: insr z1.d, d18
-; VBITS_GE_256-NEXT: mov z18.d, z5.d[3]
-; VBITS_GE_256-NEXT: mov z19.d, z4.d[3]
-; VBITS_GE_256-NEXT: insr z4.d, d18
-; VBITS_GE_256-NEXT: mov z18.d, z16.d[3]
-; VBITS_GE_256-NEXT: insr z3.d, d18
-; VBITS_GE_256-NEXT: mov z18.d, z7.d[3]
-; VBITS_GE_256-NEXT: insr z7.d, d6
-; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT: mov z6.d, z17.d[3]
-; VBITS_GE_256-NEXT: insr z16.d, d19
-; VBITS_GE_256-NEXT: insr z2.d, d18
-; VBITS_GE_256-NEXT: insr z17.d, d0
-; VBITS_GE_256-NEXT: insr z5.d, d6
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z16.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z17.d }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32f64:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: mov w8, #31
-; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: whilels p1.d, xzr, x8
-; VBITS_GE_2048-NEXT: lastb d0, p1, z0.d
-; VBITS_GE_2048-NEXT: insr z1.d, d0
-; VBITS_GE_2048-NEXT: st1d { z1.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: whilels p1.d, xzr, x8
+; CHECK-NEXT: lastb d0, p1, z0.d
+; CHECK-NEXT: insr z1.d, d0
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%ret = shufflevector <32 x double> %op1, <32 x double> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -1448,7 +889,7 @@ define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
ret void
}
-define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_byone_reverse:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -1465,7 +906,7 @@ define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) #0 {
ret void
}
-define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shuffle_ext_invalid:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -1487,11 +928,7 @@ define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w30
-; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
@@ -1500,4 +937,4 @@ define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
ret void
}
-attributes #0 = { "target-features"="+sve" uwtable }
+attributes #0 = { "target-features"="+sve" }