[llvm] fcd058a - [SVE][CodeGen] Restructure SVE fixed length tests to use update_llc_test_checks.

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 16 16:32:30 PDT 2022


Author: Paul Walker
Date: 2022-06-17T00:30:56+01:00
New Revision: fcd058acc95c14ae5202a22548c5e40287e593c5

URL: https://github.com/llvm/llvm-project/commit/fcd058acc95c14ae5202a22548c5e40287e593c5
DIFF: https://github.com/llvm/llvm-project/commit/fcd058acc95c14ae5202a22548c5e40287e593c5.diff

LOG: [SVE][CodeGen] Restructure SVE fixed length tests to use update_llc_test_checks.

Most tests have been updated to use vscale_range to reduce the
number of RUN lines.  For the remaining RUN lines the check
prefixes have been updated so that the intent of the original
manual CHECK lines is preserved after update_llc_test_checks
is run.
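
For example, in sve-fixed-length-bit-counting.ll (taken from the diff
below) the original sixteen RUN lines collapse to three, and tests that
only need a 256-bit SVE vector carry an explicit vscale_range(2,0)
attribute so the common CHECK prefix covers them:

  ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
  ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
  ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

  define void @ctlz_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
  ; CHECK-LABEL: ctlz_v32i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ptrue p0.b, vl32
  ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
  ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
  ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
  ; CHECK-NEXT:    ret
    %op = load <32 x i8>, <32 x i8>* %a
    %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
    store <32 x i8> %res, <32 x i8>* %a
    ret void
  }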

Added: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll

Modified: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll

Removed: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
index 201f4e6945ee..f2dc770242a6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -1,54 +1,42 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; CLZ
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @ctlz_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @ctlz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v8i8:
-; CHECK: clz v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    clz v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @ctlz_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @ctlz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v16i8:
-; CHECK: clz v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    clz v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
 
-define void @ctlz_v32i8(<32 x i8>* %a) #0 {
+define void @ctlz_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, <32 x i8>* %a
@@ -56,49 +44,53 @@ define void @ctlz_v32i8(<32 x i8>* %a) #0 {
 }
 
 define void @ctlz_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: ctlz_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    clz z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    clz z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: ctlz_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    clz z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %op)
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
-define void @ctlz_v128i8(<128 x i8>* %a) #0 {
+define void @ctlz_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctlz_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call <128 x i8> @llvm.ctlz.v128i8(<128 x i8> %op)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
 
-define void @ctlz_v256i8(<256 x i8>* %a) #0 {
+define void @ctlz_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctlz_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call <256 x i8> @llvm.ctlz.v256i8(<256 x i8> %op)
   store <256 x i8> %res, <256 x i8>* %a
@@ -106,30 +98,33 @@ define void @ctlz_v256i8(<256 x i8>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @ctlz_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @ctlz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v4i16:
-; CHECK: clz v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    clz v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @ctlz_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @ctlz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v8i16:
-; CHECK: clz v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    clz v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
 
-define void @ctlz_v16i16(<16 x i16>* %a) #0 {
+define void @ctlz_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -137,49 +132,53 @@ define void @ctlz_v16i16(<16 x i16>* %a) #0 {
 }
 
 define void @ctlz_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: ctlz_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    clz z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    clz z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ctlz_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    clz z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
-define void @ctlz_v64i16(<64 x i16>* %a) #0 {
+define void @ctlz_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctlz_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.ctlz.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @ctlz_v128i16(<128 x i16>* %a) #0 {
+define void @ctlz_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctlz_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.ctlz.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
@@ -187,30 +186,33 @@ define void @ctlz_v128i16(<128 x i16>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @ctlz_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @ctlz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v2i32:
-; CHECK: clz v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    clz v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @ctlz_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @ctlz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v4i32:
-; CHECK: clz v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    clz v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
 
-define void @ctlz_v8i32(<8 x i32>* %a) #0 {
+define void @ctlz_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -218,80 +220,91 @@ define void @ctlz_v8i32(<8 x i32>* %a) #0 {
 }
 
 define void @ctlz_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: ctlz_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    clz z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    clz z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ctlz_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    clz z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
-define void @ctlz_v32i32(<32 x i32>* %a) #0 {
+define void @ctlz_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctlz_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.ctlz.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @ctlz_v64i32(<64 x i32>* %a) #0 {
+define void @ctlz_v64i32(<64 x i32>* %a)  vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctlz_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.ctlz.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
   ret void
 }
 
-define <1 x i64> @ctlz_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: clz z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
 
-define <2 x i64> @ctlz_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: clz z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
 
-define void @ctlz_v4i64(<4 x i64>* %a) #0 {
+define void @ctlz_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, <4 x i64>* %a
@@ -299,49 +312,53 @@ define void @ctlz_v4i64(<4 x i64>* %a) #0 {
 }
 
 define void @ctlz_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: ctlz_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctlz_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    clz z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    clz z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ctlz_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    clz z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %op)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
 
-define void @ctlz_v16i64(<16 x i64>* %a) #0 {
+define void @ctlz_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctlz_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> %op)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @ctlz_v32i64(<32 x i64>* %a) #0 {
+define void @ctlz_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctlz_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.ctlz.v32i64(<32 x i64> %op)
   store <32 x i64> %res, <32 x i64>* %a
@@ -353,30 +370,33 @@ define void @ctlz_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @ctpop_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @ctpop_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v8i8:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @ctpop_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @ctpop_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v16i8:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
 
-define void @ctpop_v32i8(<32 x i8>* %a) #0 {
+define void @ctpop_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
   store <32 x i8> %res, <32 x i8>* %a
@@ -384,49 +404,53 @@ define void @ctpop_v32i8(<32 x i8>* %a) #0 {
 }
 
 define void @ctpop_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: ctpop_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    cnt z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    cnt z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: ctpop_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cnt z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %op)
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
-define void @ctpop_v128i8(<128 x i8>* %a) #0 {
+define void @ctpop_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctpop_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call <128 x i8> @llvm.ctpop.v128i8(<128 x i8> %op)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
 
-define void @ctpop_v256i8(<256 x i8>* %a) #0 {
+define void @ctpop_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctpop_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call <256 x i8> @llvm.ctpop.v256i8(<256 x i8> %op)
   store <256 x i8> %res, <256 x i8>* %a
@@ -434,32 +458,35 @@ define void @ctpop_v256i8(<256 x i8>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @ctpop_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v4i16:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @ctpop_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v8i16:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
 
-define void @ctpop_v16i16(<16 x i16>* %a) #0 {
+define void @ctpop_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -467,49 +494,53 @@ define void @ctpop_v16i16(<16 x i16>* %a) #0 {
 }
 
 define void @ctpop_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: ctpop_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    cnt z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    cnt z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ctpop_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cnt z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
-define void @ctpop_v64i16(<64 x i16>* %a) #0 {
+define void @ctpop_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctpop_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.ctpop.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @ctpop_v128i16(<128 x i16>* %a) #0 {
+define void @ctpop_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctpop_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.ctpop.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
@@ -517,34 +548,37 @@ define void @ctpop_v128i16(<128 x i16>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @ctpop_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v2i32:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @ctpop_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v4i32:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
 
-define void @ctpop_v8i32(<8 x i32>* %a) #0 {
+define void @ctpop_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -552,49 +586,53 @@ define void @ctpop_v8i32(<8 x i32>* %a) #0 {
 }
 
 define void @ctpop_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: ctpop_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    cnt z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    cnt z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ctpop_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cnt z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
-define void @ctpop_v32i32(<32 x i32>* %a) #0 {
+define void @ctpop_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctpop_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.ctpop.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @ctpop_v64i32(<64 x i32>* %a) #0 {
+define void @ctpop_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctpop_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.ctpop.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
@@ -602,36 +640,39 @@ define void @ctpop_v64i32(<64 x i32>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @ctpop_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v1i64:
-; CHECK: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: uaddlp v0.1d, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    uaddlp v0.1d, v0.2s
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @ctpop_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v2i64:
-; CHECK: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
 
-define void @ctpop_v4i64(<4 x i64>* %a) #0 {
+define void @ctpop_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctpop_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
   store <4 x i64> %res, <4 x i64>* %a
@@ -639,49 +680,53 @@ define void @ctpop_v4i64(<4 x i64>* %a) #0 {
 }
 
 define void @ctpop_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: ctpop_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: cnt [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: cnt [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ctpop_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    cnt z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    cnt z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ctpop_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cnt z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %op)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
 
-define void @ctpop_v16i64(<16 x i64>* %a) #0 {
+define void @ctpop_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ctpop_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> %op)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @ctpop_v32i64(<32 x i64>* %a) #0 {
+define void @ctpop_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ctpop_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: cnt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.ctpop.v32i64(<32 x i64> %op)
   store <32 x i64> %res, <32 x i64>* %a
@@ -692,34 +737,39 @@ define void @ctpop_v32i64(<32 x i64>* %a) #0 {
 ; Count trailing zeros
 ;
 
-define <8 x i8> @cttz_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].b, p0/m, z0.b
-; CHECK-NEXT: clz v0.8b, v[[RBIT]].8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    clz v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
 
-define <16 x i8> @cttz_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].b, p0/m, z0.b
-; CHECK-NEXT: clz v0.16b, v[[RBIT]].16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    clz v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
 
-define void @cttz_v32i8(<32 x i8>* %a) #0 {
+define void @cttz_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, <32 x i8>* %a
@@ -727,88 +777,97 @@ define void @cttz_v32i8(<32 x i8>* %a) #0 {
 }
 
 define void @cttz_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: cttz_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    rbit z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    clz z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    clz z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[RBIT_LO]].b
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[RBIT_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: cttz_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    clz z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %op)
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
-define void @cttz_v128i8(<128 x i8>* %a) #0 {
+define void @cttz_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: cttz_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call <128 x i8> @llvm.cttz.v128i8(<128 x i8> %op)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
 
-define void @cttz_v256i8(<256 x i8>* %a) #0 {
+define void @cttz_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: cttz_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].b, [[PG]]/m, [[RBIT]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call <256 x i8> @llvm.cttz.v256i8(<256 x i8> %op)
   store <256 x i8> %res, <256 x i8>* %a
   ret void
 }
 
-define <4 x i16> @cttz_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].h, p0/m, z0.h
-; CHECK-NEXT: clz v0.4h, v[[RBIT]].4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    clz v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
 
-define <8 x i16> @cttz_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @cttz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].h, p0/m, z0.h
-; CHECK-NEXT: clz v0.8h, v[[RBIT]].8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    clz v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
 
-define void @cttz_v16i16(<16 x i16>* %a) #0 {
+define void @cttz_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -816,54 +875,58 @@ define void @cttz_v16i16(<16 x i16>* %a) #0 {
 }
 
 define void @cttz_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: cttz_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[RBIT_LO]].h
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[RBIT_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    rbit z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    clz z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    clz z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: cttz_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    clz z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
-define void @cttz_v64i16(<64 x i16>* %a) #0 {
+define void @cttz_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: cttz_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.cttz.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @cttz_v128i16(<128 x i16>* %a) #0 {
+define void @cttz_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: cttz_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].h, [[PG]]/m, [[RBIT]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.cttz.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
@@ -871,35 +934,40 @@ define void @cttz_v128i16(<128 x i16>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @cttz_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].s, p0/m, z0.s
-; CHECK-NEXT: clz v0.2s, v[[RBIT]].2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    clz v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @cttz_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @cttz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: rbit z[[RBIT:[0-9]+]].s, p0/m, z0.s
-; CHECK-NEXT: clz v0.4s, v[[RBIT]].4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    clz v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
 
-define void @cttz_v8i32(<8 x i32>* %a) #0 {
+define void @cttz_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -907,88 +975,99 @@ define void @cttz_v8i32(<8 x i32>* %a) #0 {
 }
 
 define void @cttz_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: cttz_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[RBIT_LO]].s
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[RBIT_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    rbit z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    clz z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    clz z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: cttz_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    clz z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
-define void @cttz_v32i32(<32 x i32>* %a) #0 {
+define void @cttz_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: cttz_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.cttz.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @cttz_v64i32(<64 x i32>* %a) #0 {
+define void @cttz_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: cttz_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].s, [[PG]]/m, [[RBIT]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.cttz.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
   ret void
 }
 
-define <1 x i64> @cttz_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: clz z0.d, [[PG]]/m, [[RBIT]].d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
 
-define <2 x i64> @cttz_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: clz z0.d, [[PG]]/m, [[RBIT]].d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
 
-define void @cttz_v4i64(<4 x i64>* %a) #0 {
+define void @cttz_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, <4 x i64>* %a
@@ -996,54 +1075,58 @@ define void @cttz_v4i64(<4 x i64>* %a) #0 {
 }
 
 define void @cttz_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: cttz_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: rbit [[RBIT_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: rbit [[RBIT_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: clz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[RBIT_LO]].d
-; VBITS_EQ_256-DAG: clz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[RBIT_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: cttz_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    rbit z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    clz z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    clz z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: cttz_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    clz z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %op)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
 
-define void @cttz_v16i64(<16 x i64>* %a) #0 {
+define void @cttz_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: cttz_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> %op)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @cttz_v32i64(<32 x i64>* %a) #0 {
+define void @cttz_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: cttz_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RBIT:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: clz [[RES:z[0-9]+]].d, [[PG]]/m, [[RBIT]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.cttz.v32i64(<32 x i64> %op)
   store <32 x i64> %res, <32 x i64>* %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
index eb4186cd6aef..45008aa7abfd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
@@ -1,31 +1,17 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ; Don't use SVE for 64-bit vectors.
-define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
+define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v4i16:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    str d0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <4 x i16>, <4 x i16>* %a
   %cast = bitcast <4 x i16> %load to <4 x half>
   store volatile <4 x half> %cast, <4 x half>* %b
@@ -33,23 +19,25 @@ define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) #0 {
+define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <8 x i16>, <8 x i16>* %a
   %cast = bitcast <8 x i16> %load to <8 x half>
   store volatile <8 x half> %cast, <8 x half>* %b
   ret void
 }
 
-define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
+define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <16 x i16>, <16 x i16>* %a
   %cast = bitcast <16 x i16> %load to <16 x half>
   store volatile <16 x half> %cast, <16 x half>* %b
@@ -57,35 +45,48 @@ define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
 }
 
 define void @bitcast_v32i16(<32 x i16> *%a, <32 x half>* %b) #0 {
-; CHECK-LABEL: bitcast_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitcast_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %load = load volatile <32 x i16>, <32 x i16>* %a
   %cast = bitcast <32 x i16> %load to <32 x half>
   store volatile <32 x half> %cast, <32 x half>* %b
   ret void
 }
 
-define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) #0 {
+define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitcast_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <64 x i16>, <64 x i16>* %a
   %cast = bitcast <64 x i16> %load to <64 x half>
   store volatile <64 x half> %cast, <64 x half>* %b
   ret void
 }
 
-define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
+define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitcast_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <128 x i16>, <128 x i16>* %a
   %cast = bitcast <128 x i16> %load to <128 x half>
   store volatile <128 x half> %cast, <128 x half>* %b
@@ -93,11 +94,12 @@ define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
+define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v2i32:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    str d0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <2 x i32>, <2 x i32>* %a
   %cast = bitcast <2 x i32> %load to <2 x float>
   store volatile <2 x float> %cast, <2 x float>* %b
@@ -105,23 +107,25 @@ define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) #0 {
+define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v4i32:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <4 x i32>, <4 x i32>* %a
   %cast = bitcast <4 x i32> %load to <4 x float>
   store volatile <4 x float> %cast, <4 x float>* %b
   ret void
 }
 
-define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
+define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <8 x i32>, <8 x i32>* %a
   %cast = bitcast <8 x i32> %load to <8 x float>
   store volatile <8 x float> %cast, <8 x float>* %b
@@ -129,35 +133,48 @@ define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
 }
 
 define void @bitcast_v16i32(<16 x i32> *%a, <16 x float>* %b) #0 {
-; CHECK-LABEL: bitcast_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitcast_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %load = load volatile <16 x i32>, <16 x i32>* %a
   %cast = bitcast <16 x i32> %load to <16 x float>
   store volatile <16 x float> %cast, <16 x float>* %b
   ret void
 }
 
-define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) #0 {
+define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitcast_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <32 x i32>, <32 x i32>* %a
   %cast = bitcast <32 x i32> %load to <32 x float>
   store volatile <32 x float> %cast, <32 x float>* %b
   ret void
 }
 
-define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
+define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitcast_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <64 x i32>, <64 x i32>* %a
   %cast = bitcast <64 x i32> %load to <64 x float>
   store volatile <64 x float> %cast, <64 x float>* %b
@@ -165,11 +182,12 @@ define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
+define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v1i64:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    str d0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <1 x i64>, <1 x i64>* %a
   %cast = bitcast <1 x i64> %load to <1 x double>
   store volatile <1 x double> %cast, <1 x double>* %b
@@ -177,23 +195,25 @@ define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) #0 {
+define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v2i64:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <2 x i64>, <2 x i64>* %a
   %cast = bitcast <2 x i64> %load to <2 x double>
   store volatile <2 x double> %cast, <2 x double>* %b
   ret void
 }
 
-define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
+define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <4 x i64>, <4 x i64>* %a
   %cast = bitcast <4 x i64> %load to <4 x double>
   store volatile <4 x double> %cast, <4 x double>* %b
@@ -201,35 +221,48 @@ define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
 }
 
 define void @bitcast_v8i64(<8 x i64> *%a, <8 x double>* %b) #0 {
-; CHECK-LABEL: bitcast_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitcast_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %load = load volatile <8 x i64>, <8 x i64>* %a
   %cast = bitcast <8 x i64> %load to <8 x double>
   store volatile <8 x double> %cast, <8 x double>* %b
   ret void
 }
 
-define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) #0 {
+define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitcast_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <16 x i64>, <16 x i64>* %a
   %cast = bitcast <16 x i64> %load to <16 x double>
   store volatile <16 x double> %cast, <16 x double>* %b
   ret void
 }
 
-define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) #0 {
+define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitcast_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %load = load volatile <32 x i64>, <32 x i64>* %a
   %cast = bitcast <32 x i64> %load to <32 x double>
   store volatile <32 x double> %cast, <32 x double>* %b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
index becdc912d1fb..50b0be2601cf 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -1,57 +1,47 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; i8
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) #0 {
+define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i8:
-; CHECK: uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i8:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
 }
 
-define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) #0 {
+define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v32i8:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], z[[OP1]].b, z[[OP2]].b
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b, vl32
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    st1b { z1.b }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i8>, <16 x i8>* %a
   %op2 = load <16 x i8>, <16 x i8>* %b
   %res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -63,14 +53,25 @@ define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) #0 {
 }
 
 define void @concat_v64i8(<32 x i8>* %a, <32 x i8>* %b, <64 x i8>* %c) #0 {
-; CHECK-LABEL: concat_v64i8:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2, x8]
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.b, p0, z0.b, z1.b
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -85,15 +86,16 @@ define void @concat_v64i8(<32 x i8>* %a, <32 x i8>* %b, <64 x i8>* %c) #0 {
   ret void
 }
 
-define void @concat_v128i8(<64 x i8>* %a, <64 x i8>* %b, <128 x i8>* %c) #0 {
+define void @concat_v128i8(<64 x i8>* %a, <64 x i8>* %b, <128 x i8>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v128i8:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    st1b { z0.b }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = shufflevector <64 x i8> %op1, <64 x i8> %op2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -116,15 +118,16 @@ define void @concat_v128i8(<64 x i8>* %a, <64 x i8>* %b, <128 x i8>* %c) #0 {
   ret void
 }
 
-define void @concat_v256i8(<128 x i8>* %a, <128 x i8>* %b, <256 x i8>* %c) #0 {
+define void @concat_v256i8(<128 x i8>* %a, <128 x i8>* %b, <256 x i8>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v256i8:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    st1b { z0.b }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = shufflevector <128 x i8> %op1, <128 x i8> %op2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -168,32 +171,37 @@ define void @concat_v256i8(<128 x i8>* %a, <128 x i8>* %b, <256 x i8>* %c) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) #0 {
+define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i16:
-; CHECK: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i16:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
 
-define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) #0 {
+define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i16:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], z[[OP1]].h, z[[OP2]].h
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    st1h { z1.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i16>, <8 x i16>* %a
   %op2 = load <8 x i16>, <8 x i16>* %b
   %res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -203,14 +211,25 @@ define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) #0 {
 }
 
 define void @concat_v32i16(<16 x i16>* %a, <16 x i16>* %b, <32 x i16>* %c) #0 {
-; CHECK-LABEL: concat_v32i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.h, p0, z0.h, z1.h
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -221,15 +240,16 @@ define void @concat_v32i16(<16 x i16>* %a, <16 x i16>* %b, <32 x i16>* %c) #0 {
   ret void
 }
 
-define void @concat_v64i16(<32 x i16>* %a, <32 x i16>* %b, <64 x i16>* %c) #0 {
+define void @concat_v64i16(<32 x i16>* %a, <32 x i16>* %b, <64 x i16>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v64i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = shufflevector <32 x i16> %op1, <32 x i16> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -244,15 +264,16 @@ define void @concat_v64i16(<32 x i16>* %a, <32 x i16>* %b, <64 x i16>* %c) #0 {
   ret void
 }
 
-define void @concat_v128i16(<64 x i16>* %a, <64 x i16>* %b, <128 x i16>* %c) #0 {
+define void @concat_v128i16(<64 x i16>* %a, <64 x i16>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v128i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = shufflevector <64 x i16> %op1, <64 x i16> %op2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -280,32 +301,37 @@ define void @concat_v128i16(<64 x i16>* %a, <64 x i16>* %b, <128 x i16>* %c) #0
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) #0 {
+define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v2i32:
-; CHECK: zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i32:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
 
-define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) #0 {
+define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i32:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], z[[OP1]].s, z[[OP2]].s
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i32>, <4 x i32>* %a
   %op2 = load <4 x i32>, <4 x i32>* %b
   %res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -314,14 +340,25 @@ define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) #0 {
 }
 
 define void @concat_v16i32(<8 x i32>* %a, <8 x i32>* %b, <16 x i32>* %c) #0 {
-; CHECK-LABEL: concat_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.s, p0, z0.s, z1.s
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = shufflevector <8 x i32> %op1, <8 x i32> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -330,15 +367,16 @@ define void @concat_v16i32(<8 x i32>* %a, <8 x i32>* %b, <16 x i32>* %c) #0 {
   ret void
 }
 
-define void @concat_v32i32(<16 x i32>* %a, <16 x i32>* %b, <32 x i32>* %c) #0 {
+define void @concat_v32i32(<16 x i32>* %a, <16 x i32>* %b, <32 x i32>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = shufflevector <16 x i32> %op1, <16 x i32> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -349,15 +387,16 @@ define void @concat_v32i32(<16 x i32>* %a, <16 x i32>* %b, <32 x i32>* %c) #0 {
   ret void
 }
 
-define void @concat_v64i32(<32 x i32>* %a, <32 x i32>* %b, <64 x i32>* %c) #0 {
+define void @concat_v64i32(<32 x i32>* %a, <32 x i32>* %b, <64 x i32>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v64i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = shufflevector <32 x i32> %op1, <32 x i32> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -377,23 +416,27 @@ define void @concat_v64i32(<32 x i32>* %a, <32 x i32>* %b, <64 x i32>* %c) #0 {
 ;
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v2i64:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %res
 }
 
-define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) #0 {
+define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i64:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl2
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], z[[OP1]].d, z[[OP2]].d
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    st1d { z1.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <2 x i64>, <2 x i64>* %a
   %op2 = load <2 x i64>, <2 x i64>* %b
   %res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -402,14 +445,25 @@ define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) #0 {
 }
 
 define void @concat_v8i64(<4 x i64>* %a, <4 x i64>* %b, <8 x i64>* %c) #0 {
-; CHECK-LABEL: concat_v8i64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.d, p0, z0.d, z1.d
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -417,15 +471,16 @@ define void @concat_v8i64(<4 x i64>* %a, <4 x i64>* %b, <8 x i64>* %c) #0 {
   ret void
 }
 
-define void @concat_v16i64(<8 x i64>* %a, <8 x i64>* %b, <16 x i64>* %c) #0 {
+define void @concat_v16i64(<8 x i64>* %a, <8 x i64>* %b, <16 x i64>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = shufflevector <8 x i64> %op1, <8 x i64> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -434,15 +489,16 @@ define void @concat_v16i64(<8 x i64>* %a, <8 x i64>* %b, <16 x i64>* %c) #0 {
   ret void
 }
 
-define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) #0 {
+define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = shufflevector <16 x i64> %op1, <16 x i64> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -458,32 +514,37 @@ define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) #0 {
+define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4f16:
-; CHECK: zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8f16:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
 }
 
-define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) #0 {
+define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16f16:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], z[[OP1]].h, z[[OP2]].h
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    st1h { z1.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %op2 = load <8 x half>, <8 x half>* %b
   %res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -493,14 +554,25 @@ define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) #0 {
 }
 
 define void @concat_v32f16(<16 x half>* %a, <16 x half>* %b, <32 x half>* %c) #0 {
-; CHECK-LABEL: concat_v32f16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.h, p0, z0.h, z1.h
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -511,15 +583,16 @@ define void @concat_v32f16(<16 x half>* %a, <16 x half>* %b, <32 x half>* %c) #0
   ret void
 }
 
-define void @concat_v64f16(<32 x half>* %a, <32 x half>* %b, <64 x half>* %c) #0 {
+define void @concat_v64f16(<32 x half>* %a, <32 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v64f16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = shufflevector <32 x half> %op1, <32 x half> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -534,15 +607,16 @@ define void @concat_v64f16(<32 x half>* %a, <32 x half>* %b, <64 x half>* %c) #0
   ret void
 }
 
-define void @concat_v128f16(<64 x half>* %a, <64 x half>* %b, <128 x half>* %c) #0 {
+define void @concat_v128f16(<64 x half>* %a, <64 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v128f16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = shufflevector <64 x half> %op1, <64 x half> %op2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -570,32 +644,37 @@ define void @concat_v128f16(<64 x half>* %a, <64 x half>* %b, <128 x half>* %c)
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) #0 {
+define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v2f32:
-; CHECK: zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4f32:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
 }
 
-define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) #0 {
+define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8f32:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], z[[OP1]].s, z[[OP2]].s
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <4 x float>, <4 x float>* %a
   %op2 = load <4 x float>, <4 x float>* %b
   %res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -604,14 +683,25 @@ define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) #0
 }
 
 define void @concat_v16f32(<8 x float>* %a, <8 x float>* %b, <16 x float>* %c) #0 {
-; CHECK-LABEL: concat_v16f32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.s, p0, z0.s, z1.s
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -620,15 +710,16 @@ define void @concat_v16f32(<8 x float>* %a, <8 x float>* %b, <16 x float>* %c) #
   ret void
 }
 
-define void @concat_v32f32(<16 x float>* %a, <16 x float>* %b, <32 x float>* %c) #0 {
+define void @concat_v32f32(<16 x float>* %a, <16 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v32f32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = shufflevector <16 x float> %op1, <16 x float> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -639,15 +730,16 @@ define void @concat_v32f32(<16 x float>* %a, <16 x float>* %b, <32 x float>* %c)
   ret void
 }
 
-define void @concat_v64f32(<32 x float>* %a, <32 x float>* %b, <64 x float>* %c) #0 {
+define void @concat_v64f32(<32 x float>* %a, <32 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v64f32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = shufflevector <32 x float> %op1, <32 x float> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -667,23 +759,27 @@ define void @concat_v64f32(<32 x float>* %a, <32 x float>* %b, <64 x float>* %c)
 ;
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v2f64:
-; CHECK: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
   %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %res
 }
 
-define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c) #0 {
+define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4f64:
-; CHECK: ldr q[[OP2:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl2
-; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], z[[OP1]].d, z[[OP2]].d
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    st1d { z1.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <2 x double>, <2 x double>* %a
   %op2 = load <2 x double>, <2 x double>* %b
   %res = shufflevector <2 x double> %op1, <2 x double> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -692,14 +788,25 @@ define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c)
 }
 
 define void @concat_v8f64(<4 x double>* %a, <4 x double>* %b, <8 x double>* %c) #0 {
-; CHECK-LABEL: concat_v8f64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: concat_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: concat_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    splice z0.d, p0, z0.d, z1.d
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -707,15 +814,16 @@ define void @concat_v8f64(<4 x double>* %a, <4 x double>* %b, <8 x double>* %c)
   ret void
 }
 
-define void @concat_v16f64(<8 x double>* %a, <8 x double>* %b, <16 x double>* %c) #0 {
+define void @concat_v16f64(<8 x double>* %a, <8 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v16f64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = shufflevector <8 x double> %op1, <8 x double> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -724,15 +832,16 @@ define void @concat_v16f64(<8 x double>* %a, <8 x double>* %b, <16 x double>* %c
   ret void
 }
 
-define void @concat_v32f64(<16 x double>* %a, <16 x double>* %b, <32 x double>* %c) #0 {
+define void @concat_v32f64(<16 x double>* %a, <16 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v32f64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG2]], [x2]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = shufflevector <16 x double> %op1, <16 x double> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -747,12 +856,13 @@ define void @concat_v32f64(<16 x double>* %a, <16 x double>* %b, <32 x double>*
 ; undef
 ;
 
-define void @concat_v32i8_undef(<16 x i8>* %a, <32 x i8>* %b) #0 {
+define void @concat_v32i8_undef(<16 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v32i8_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: st1b { z[[OP1]].b }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i8>, <16 x i8>* %a
   %res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
@@ -762,12 +872,13 @@ define void @concat_v32i8_undef(<16 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @concat_v16i16_undef(<8 x i16>* %a, <16 x i16>* %b) #0 {
+define void @concat_v16i16_undef(<8 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i16_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { z[[OP1]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i16>, <8 x i16>* %a
   %res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -775,24 +886,26 @@ define void @concat_v16i16_undef(<8 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @concat_v8i32_undef(<4 x i32>* %a, <8 x i32>* %b) #0 {
+define void @concat_v8i32_undef(<4 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i32_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { z[[OP1]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i32>, <4 x i32>* %a
   %res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %res, <8 x i32>* %b
   ret void
 }
 
-define void @concat_v4i64_undef(<2 x i64>* %a, <4 x i64>* %b) #0 {
+define void @concat_v4i64_undef(<2 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i64_undef:
-; CHECK: ldr q[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { z[[OP1]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <2 x i64>, <2 x i64>* %a
   %res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i64> %res, <4 x i64>* %b
@@ -803,12 +916,13 @@ define void @concat_v4i64_undef(<2 x i64>* %a, <4 x i64>* %b) #0 {
 ; > 2 operands
 ;
 
-define void @concat_v32i8_4op(<8 x i8>* %a, <32 x i8>* %b) #0 {
+define void @concat_v32i8_4op(<8 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v32i8_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: st1b { z[[OP1]].b }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i8>, <8 x i8>* %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -820,12 +934,13 @@ define void @concat_v32i8_4op(<8 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @concat_v16i16_4op(<4 x i16>* %a, <16 x i16>* %b) #0 {
+define void @concat_v16i16_4op(<4 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i16_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: st1h { z[[OP1]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i16>, <4 x i16>* %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -834,12 +949,13 @@ define void @concat_v16i16_4op(<4 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @concat_v8i32_4op(<2 x i32>* %a, <8 x i32>* %b) #0 {
+define void @concat_v8i32_4op(<2 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i32_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: st1w { z[[OP1]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <2 x i32>, <2 x i32>* %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -847,12 +963,13 @@ define void @concat_v8i32_4op(<2 x i32>* %a, <8 x i32>* %b) #0 {
   ret void
 }
 
-define void @concat_v4i64_4op(<1 x i64>* %a, <4 x i64>* %b) #0 {
+define void @concat_v4i64_4op(<1 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i64_4op:
-; CHECK: ldr d[[OP1:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: st1d { z[[OP1]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <1 x i64>, <1 x i64>* %a
   %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
   %res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index 9c1ee54d2665..fde767ac4014 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -1,25 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=1024  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 
 target triple = "aarch64-unknown-linux-gnu"
 
-define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
+define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_zext_v4i16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -49,7 +34,7 @@ define <2 x i256> @load_zext_v2i64i256(<2 x i64>* %ap) #0 {
   ret <2 x i256> %val
 }
 
-define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
+define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_zext_v8i16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -61,103 +46,43 @@ define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
   ret <8 x i32> %val
 }
 
-define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v16i16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: load_zext_v16i16i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_512-NEXT:    ret
-
-  ; Ensure sensible type legalistaion
+define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
+; CHECK-LABEL: load_zext_v16i16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %val = zext <16 x i16> %a to <16 x i32>
   ret <16 x i32> %val
 }
 
-define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: load_zext_v32i16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT:    ret
+define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
+; CHECK-LABEL: load_zext_v32i16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %ap
   %val = zext <32 x i16> %a to <32 x i32>
   ret <32 x i32> %val
 }
 
 define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v64i16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    mov x11, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    mov x10, #56
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    mov x9, #32
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v64i16i32:
 ; VBITS_GE_2048:       // %bb.0:
@@ -170,7 +95,7 @@ define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
   ret <64 x i32> %val
 }
 
-define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
+define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_sext_v4i16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -181,7 +106,7 @@ define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
   ret <4 x i32> %val
 }
 
-define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
+define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_sext_v8i16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -193,103 +118,43 @@ define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
   ret <8 x i32> %val
 }
 
-define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v16i16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: load_sext_v16i16i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_512-NEXT:    ret
-
-  ; Ensure sensible type legalistaion
+define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
+; CHECK-LABEL: load_sext_v16i16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %val = sext <16 x i16> %a to <16 x i32>
   ret <16 x i32> %val
 }
 
-define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: load_sext_v32i16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT:    ret
+define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
+; CHECK-LABEL: load_sext_v32i16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %ap
   %val = sext <32 x i16> %a to <32 x i32>
   ret <32 x i32> %val
 }
 
 define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v64i16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    mov x11, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    mov x10, #56
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    mov x9, #32
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    sunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v64i16i32:
 ; VBITS_GE_2048:       // %bb.0:
@@ -303,52 +168,22 @@ define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
 }
 
 define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i8i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ushll2 v2.8h, v0.16b, #0
-; VBITS_GE_256-NEXT:    ushll v1.8h, v0.8b, #0
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    ushll2 v4.8h, v0.16b, #0
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #28
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z3.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z4.h
-; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #16
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    uunpklo z1.h, z0.b
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_1024-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_1024-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v32i8i64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -362,52 +197,22 @@ define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
 }
 
 define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i8i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    sshll2 v2.8h, v0.16b, #0
-; VBITS_GE_256-NEXT:    sshll v1.8h, v0.8b, #0
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    sshll2 v4.8h, v0.16b, #0
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #28
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z3.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z4.h
-; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #16
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    sunpklo z1.h, z0.b
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_1024-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_1024-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v32i8i64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -421,50 +226,20 @@ define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
 }
 
 define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    uunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    uunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z3.h
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z6.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z4.s
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #16
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #32
+; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v32i16i64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -478,50 +253,20 @@ define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
 }
 
 define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z3.h
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z6.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z4.s
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #16
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    sunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #32
+; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v32i16i64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -535,42 +280,18 @@ define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
 }
 
 define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x12, #12
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT:    uunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #28
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #16
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v32i32i64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -584,42 +305,18 @@ define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
 }
 
 define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x12, #12
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #28
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #16
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    sunpklo z1.d, z0.s
+; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v32i32i64:
 ; VBITS_GE_2048:       // %bb.0:

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index f454be420905..402e270b5313 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -1,28 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; i8
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
+define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
@@ -32,7 +18,7 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
+define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -42,7 +28,7 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
   ret <8 x i8> %ret
 }
 
-define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) #0 {
+define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl32
@@ -79,62 +65,30 @@ define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #64
-; VBITS_GE_256-NEXT:    mov w9, #96
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT:    mov w8, #32
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %ret = call <64 x i8> @llvm.experimental.vector.extract.v64i8.v128i8(<128 x i8> %op, i64 64)
   store <64 x i8> %ret, <64 x i8>* %b
   ret void
 }
 
-define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v256i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #128
-; VBITS_GE_256-NEXT:    mov w9, #160
-; VBITS_GE_256-NEXT:    mov w10, #224
-; VBITS_GE_256-NEXT:    mov w11, #192
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x0, x11]
-; VBITS_GE_256-NEXT:    mov w8, #64
-; VBITS_GE_256-NEXT:    mov w9, #96
-; VBITS_GE_256-NEXT:    mov w10, #32
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %ret = call <128 x i8> @llvm.experimental.vector.extract.v128i8.v256i8(<256 x i8> %op, i64 128)
   store <128 x i8> %ret, <128 x i8>* %b
@@ -144,7 +98,7 @@ define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
 ; i16
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
+define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -159,7 +113,7 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
+define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -169,7 +123,7 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
   ret <4 x i16> %ret
 }
 
-define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) #0 {
+define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -206,62 +160,30 @@ define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %ret = call <32 x i16> @llvm.experimental.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32)
   store <32 x i16> %ret, <32 x i16>* %b
   ret void
 }
 
-define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #64
-; VBITS_GE_256-NEXT:    mov x9, #80
-; VBITS_GE_256-NEXT:    mov x10, #112
-; VBITS_GE_256-NEXT:    mov x11, #96
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %ret = call <64 x i16> @llvm.experimental.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64)
   store <64 x i16> %ret, <64 x i16>* %b
@@ -271,7 +193,7 @@ define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
 ; i32
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
+define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -282,7 +204,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
+define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -292,7 +214,7 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
   ret <2 x i32> %ret
 }
 
-define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) #0 {
+define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -329,62 +251,30 @@ define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %ret = call <16 x i32> @llvm.experimental.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16)
   store <16 x i32> %ret, <16 x i32>* %b
   ret void
 }
 
-define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #40
-; VBITS_GE_256-NEXT:    mov x10, #56
-; VBITS_GE_256-NEXT:    mov x11, #48
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %ret = call <32 x i32> @llvm.experimental.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32)
   store <32 x i32> %ret, <32 x i32>* %b
@@ -394,7 +284,7 @@ define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
 ; i64
 
 ; Don't use SVE for 128-bit vectors.
-define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
+define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -404,7 +294,7 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
   ret <1 x i64> %ret
 }
 
-define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
+define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -418,23 +308,14 @@ define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v8i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extract_subvector_v8i64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_512-NEXT:    ret
+define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: extract_subvector_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %ret = call <4 x i64> @llvm.experimental.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4)
   store <4 x i64> %ret, <4 x i64>* %b
@@ -453,50 +334,20 @@ define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 {
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %ret = call <8 x i64> @llvm.experimental.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
   store <8 x i64> %ret, <8 x i64>* %b
   ret void
 }
 
-define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    mov x10, #28
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #16
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %ret = call <16 x i64> @llvm.experimental.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16)
   store <16 x i64> %ret, <16 x i64>* %b
@@ -506,7 +357,7 @@ define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
 ; f16
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
+define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -517,7 +368,7 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
+define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -527,7 +378,7 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
   ret <4 x half> %ret
 }
 
-define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) #0 {
+define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -564,62 +415,30 @@ define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %ret = call <32 x half> @llvm.experimental.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32)
   store <32 x half> %ret, <32 x half>* %b
   ret void
 }
 
-define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #64
-; VBITS_GE_256-NEXT:    mov x9, #80
-; VBITS_GE_256-NEXT:    mov x10, #112
-; VBITS_GE_256-NEXT:    mov x11, #96
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %ret = call <64 x half> @llvm.experimental.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64)
   store <64 x half> %ret, <64 x half>* %b
@@ -629,7 +448,7 @@ define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
 ; f32
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
+define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -640,7 +459,7 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
+define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -650,7 +469,7 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
   ret <2 x float> %ret
 }
 
-define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) #0 {
+define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -687,62 +506,30 @@ define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %ret = call <16 x float> @llvm.experimental.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16)
   store <16 x float> %ret, <16 x float>* %b
   ret void
 }
 
-define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #40
-; VBITS_GE_256-NEXT:    mov x10, #56
-; VBITS_GE_256-NEXT:    mov x11, #48
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %ret = call <32 x float> @llvm.experimental.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32)
   store <32 x float> %ret, <32 x float>* %b
@@ -752,7 +539,7 @@ define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
 ; f64
 
 ; Don't use SVE for 128-bit vectors.
-define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
+define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
@@ -762,7 +549,7 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
   ret <1 x double> %ret
 }
 
-define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) #0 {
+define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -799,62 +586,30 @@ define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 {
   ret void
 }
 
-define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %ret = call <8 x double> @llvm.experimental.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8)
   store <8 x double> %ret, <8 x double>* %b
   ret void
 }
 
-define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    mov x10, #28
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_2048-NEXT:    ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %ret = call <16 x double> @llvm.experimental.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16)
   store <16 x double> %ret, <16 x double>* %b

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index 800af021c8ce..d8de704a241e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -1,221 +1,259 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; extractelement
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define half @extractelement_v4f16(<4 x half> %op1) #0 {
+define half @extractelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v4f16:
-; CHECK:         mov h0, v0.h[3]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    ret
     %r = extractelement <4 x half> %op1, i64 3
     ret half %r
 }
 
 ; Don't use SVE for 128-bit vectors.
-define half @extractelement_v8f16(<8 x half> %op1) #0 {
+define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v8f16:
-; CHECK:         mov h0, v0.h[7]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[7]
 ; CHECK-NEXT:    ret
     %r = extractelement <8 x half> %op1, i64 7
     ret half %r
 }
 
-define half @extractelement_v16f16(<16 x half>* %a) #0 {
+define half @extractelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v16f16:
-; VBITS_GE_256:         ptrue   p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h    { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    mov z0.h, z0.h[15]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
     %op1 = load <16 x half>, <16 x half>* %a
     %r = extractelement <16 x half> %op1, i64 15
     ret half %r
 }
 
 define half @extractelement_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: extractelement_v32f16:
-; VBITS_GE_512:         ptrue   p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h    { z0.h }, p0/z, [x0]
+; VBITS_GE_256-LABEL: extractelement_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
+; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: extractelement_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    mov z0.h, z0.h[31]
+; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; VBITS_GE_512-NEXT:    ret
     %op1 = load <32 x half>, <32 x half>* %a
     %r = extractelement <32 x half> %op1, i64 31
     ret half %r
 }
 
-define half @extractelement_v64f16(<64 x half>* %a) #0 {
+define half @extractelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v64f16:
-; VBITS_GE_1024:         ptrue   p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov w8, #63
-; VBITS_GE_1024-NEXT:    ld1h    { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    whilels p0.h, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb   h0, p0, z0.h
-; VBITS_GE_1024-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    whilels p0.h, xzr, x8
+; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ret
     %op1 = load <64 x half>, <64 x half>* %a
     %r = extractelement <64 x half> %op1, i64 63
     ret half %r
 }
 
-define half @extractelement_v128f16(<128 x half>* %a) #0 {
+define half @extractelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v128f16:
-; VBITS_GE_2048:      ptrue   p0.h, vl128
-; VBITS_GE_2048-NEXT: mov w8, #127
-; VBITS_GE_2048-NEXT: ld1h    { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: whilels p0.h, xzr, x8
-; VBITS_GE_2048-NEXT: lastb   h0, p0, z0.h
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    whilels p0.h, xzr, x8
+; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ret
     %op1 = load <128 x half>, <128 x half>* %a
     %r = extractelement <128 x half> %op1, i64 127
     ret half %r
 }
 
 ; Don't use SVE for 64-bit vectors.
-define float @extractelement_v2f32(<2 x float> %op1) #0 {
+define float @extractelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v2f32:
-; CHECK:         mov s0, v0.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    ret
     %r = extractelement <2 x float> %op1, i64 1
     ret float %r
 }
 
 ; Don't use SVE for 128-bit vectors.
-define float @extractelement_v4f32(<4 x float> %op1) #0 {
+define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v4f32:
-; CHECK:         mov s0, v0.s[3]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s0, v0.s[3]
 ; CHECK-NEXT:    ret
     %r = extractelement <4 x float> %op1, i64 3
     ret float %r
 }
 
-define float @extractelement_v8f32(<8 x float>* %a) #0 {
+define float @extractelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v8f32:
-; VBITS_GE_256:         ptrue   p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w    { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    mov z0.s, z0.s[7]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
     %op1 = load <8 x float>, <8 x float>* %a
     %r = extractelement <8 x float> %op1, i64 7
     ret float %r
 }
 
 define float @extractelement_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: extractelement_v16f32:
-; VBITS_GE_512:         ptrue   p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w    { z0.s }, p0/z, [x0]
+; VBITS_GE_256-LABEL: extractelement_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
+; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: extractelement_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    mov z0.s, z0.s[15]
+; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; VBITS_GE_512-NEXT:    ret
     %op1 = load <16 x float>, <16 x float>* %a
     %r = extractelement <16 x float> %op1, i64 15
     ret float %r
 }
 
-define float @extractelement_v32f32(<32 x float>* %a) #0 {
+define float @extractelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v32f32:
-; VBITS_GE_1024:        ptrue   p0.s, vl32
-; VBITS_GE_1024-NEXT:   mov w8, #31
-; VBITS_GE_1024-NEXT:   ld1w    { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:   whilels p0.s, xzr, x8
-; VBITS_GE_1024-NEXT:   lastb   s0, p0, z0.s
-; VBITS_GE_1024-NEXT:   ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    whilels p0.s, xzr, x8
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ret
     %op1 = load <32 x float>, <32 x float>* %a
     %r = extractelement <32 x float> %op1, i64 31
     ret float %r
 }
 
-define float @extractelement_v64f32(<64 x float>* %a) #0 {
+define float @extractelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v64f32:
-; VBITS_GE_2048:        ptrue   p0.s, vl64
-; VBITS_GE_2048-NEXT:   mov w8, #63
-; VBITS_GE_2048-NEXT:   ld1w    { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:   whilels p0.s, xzr, x8
-; VBITS_GE_2048-NEXT:   lastb   s0, p0, z0.s
-; VBITS_GE_2048-NEXT:   ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    whilels p0.s, xzr, x8
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ret
     %op1 = load <64 x float>, <64 x float>* %a
     %r = extractelement <64 x float> %op1, i64 63
     ret float %r
 }
 
 ; Don't use SVE for 64-bit vectors.
-define double @extractelement_v1f64(<1 x double> %op1) #0 {
+define double @extractelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v1f64:
-; CHECK:         ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
     %r = extractelement <1 x double> %op1, i64 0
     ret double %r
 }
 
 ; Don't use SVE for 128-bit vectors.
-define double @extractelement_v2f64(<2 x double> %op1) #0 {
+define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v2f64:
-; CHECK:         mov d0, v0.d[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d0, v0.d[1]
 ; CHECK-NEXT:    ret
     %r = extractelement <2 x double> %op1, i64 1
     ret double %r
 }
 
-define double @extractelement_v4f64(<4 x double>* %a) #0 {
+define double @extractelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v4f64:
-; VBITS_GE_256:         ptrue   p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d    { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    mov z0.d, z0.d[3]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
     %op1 = load <4 x double>, <4 x double>* %a
     %r = extractelement <4 x double> %op1, i64 3
     ret double %r
 }
 
 define double @extractelement_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: extractelement_v8f64:
-; VBITS_GE_512:         ptrue   p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d    { z0.d }, p0/z, [x0]
+; VBITS_GE_256-LABEL: extractelement_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: extractelement_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    mov z0.d, z0.d[7]
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; VBITS_GE_512-NEXT:    ret
     %op1 = load <8 x double>, <8 x double>* %a
     %r = extractelement <8 x double> %op1, i64 7
     ret double %r
 }
 
-define double @extractelement_v16f64(<16 x double>* %a) #0 {
+define double @extractelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v16f64:
-; VBITS_GE_1024:         ptrue   p0.d, vl16
-; VBITS_GE_1024-NEXT:    mov w8, #15
-; VBITS_GE_1024-NEXT:    ld1d    { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    whilels p0.d, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb   d0, p0, z0.d
-; VBITS_GE_1024-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ret
     %op1 = load <16 x double>, <16 x double>* %a
     %r = extractelement <16 x double> %op1, i64 15
     ret double %r
 }
 
-define double @extractelement_v32f64(<32 x double>* %a) #0 {
+define double @extractelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v32f64:
-; VBITS_GE_2048:         ptrue   p0.d, vl32
-; VBITS_GE_2048-NEXT:    mov w8, #31
-; VBITS_GE_2048-NEXT:    ld1d    { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    whilels p0.d, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb   d0, p0, z0.d
-; VBITS_GE_2048-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ret
     %op1 = load <32 x double>, <32 x double>* %a
     %r = extractelement <32 x double> %op1, i64 31
     ret double %r

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
index a390ccccd063..1d588c90b8ef 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
@@ -1,60 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; FADD
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v4f16:
-; CHECK: fadd v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = fadd <4 x half> %op1, %op2
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v8f16:
-; CHECK: fadd v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = fadd <8 x half> %op1, %op2
   ret <8 x half> %res
 }
 
-define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = fadd <16 x half> %op1, %op2
@@ -63,18 +46,28 @@ define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fadd_v32f16:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov x[[IDX_1:[0-9]+]], #[[#div(VBYTES,2)]]
-; VBITS_LE_256-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_1]], lsl #1]
-; VBITS_LE_256-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_1]], lsl #1]
-; VBITS_LE_256-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
-; VBITS_LE_256-DAG: st1h { [[RES_1]].h }, [[PG]], [x0, x[[IDX_1]], lsl #1]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fadd_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fadd_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = fadd <32 x half> %op1, %op2
@@ -82,29 +75,15 @@ define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fadd_v64f16:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov x[[IDX_1:[0-9]+]], #[[#div(VBYTES,2)]]
-; VBITS_LE_512-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_1]], lsl #1]
-; VBITS_LE_512-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_1]], lsl #1]
-; VBITS_LE_512-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
-; VBITS_LE_512-DAG: st1h { [[RES_1]].h }, [[PG]], [x0, x[[IDX_1]], lsl #1]
-; VBITS_LE_256-DAG: mov x[[IDX_2:[0-9]+]], #[[#mul(div(VBYTES,2),2)]]
-; VBITS_LE_256-DAG: ld1h { [[OP1_2:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_2]], lsl #1]
-; VBITS_LE_256-DAG: ld1h { [[OP2_2:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_2]], lsl #1]
-; VBITS_LE_256-DAG: fadd [[RES_2:z[0-9]+]].h, [[PG]]/m, [[OP1_2]].h, [[OP2_2]].h
-; VBITS_LE_256-DAG: st1h { [[RES_2]].h }, [[PG]], [x0, x[[IDX_2]], lsl #1]
-; VBITS_LE_256-DAG: mov x[[IDX_3:[0-9]+]], #[[#mul(div(VBYTES,2),3)]]
-; VBITS_LE_256-DAG: ld1h { [[OP1_3:z[0-9]+]].h }, [[PG]]/z, [x0, x[[IDX_3]], lsl #1]
-; VBITS_LE_256-DAG: ld1h { [[OP2_3:z[0-9]+]].h }, [[PG]]/z, [x1, x[[IDX_3]], lsl #1]
-; VBITS_LE_256-DAG: fadd [[RES_3:z[0-9]+]].h, [[PG]]/m, [[OP1_3]].h, [[OP2_3]].h
-; VBITS_LE_256-DAG: st1h { [[RES_3]].h }, [[PG]], [x0, x[[IDX_3]], lsl #1]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = fadd <64 x half> %op1, %op2
@@ -112,16 +91,15 @@ define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fadd_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = fadd <128 x half> %op1, %op2
@@ -130,31 +108,34 @@ define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v2f32:
-; CHECK: fadd v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = fadd <2 x float> %op1, %op2
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v4f32:
-; CHECK: fadd v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = fadd <4 x float> %op1, %op2
   ret <4 x float> %res
 }
 
-define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = fadd <8 x float> %op1, %op2
@@ -162,16 +143,29 @@ define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
 define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fadd_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fadd_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fadd_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = fadd <16 x float> %op1, %op2
@@ -179,16 +173,15 @@ define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fadd_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = fadd <32 x float> %op1, %op2
@@ -196,16 +189,15 @@ define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fadd_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = fadd <64 x float> %op1, %op2
@@ -214,31 +206,34 @@ define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v1f64:
-; CHECK: fadd d0, d0, d1
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
   %res = fadd <1 x double> %op1, %op2
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v2f64:
-; CHECK: fadd v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = fadd <2 x double> %op1, %op2
   ret <2 x double> %res
 }
 
-define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadd_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = fadd <4 x double> %op1, %op2
@@ -246,16 +241,29 @@ define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
 define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fadd_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fadd_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fadd_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = fadd <8 x double> %op1, %op2
@@ -263,16 +271,15 @@ define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fadd_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = fadd <16 x double> %op1, %op2
@@ -280,16 +287,15 @@ define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
-; already cover the general legalisation cases.
-define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fadd_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = fadd <32 x double> %op1, %op2
@@ -297,41 +303,39 @@ define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
   ret void
 }
 
-;
-; NOTE: Tests beyond this point only have CHECK lines to validate the first
-; VBYTES because the fadd tests already validate the legalisation code paths.
-;
-
 ;
 ; FDIV
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v4f16:
-; CHECK: fdiv v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdiv v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = fdiv <4 x half> %op1, %op2
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v8f16:
-; CHECK: fdiv v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdiv v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = fdiv <8 x half> %op1, %op2
   ret <8 x half> %res
 }
 
-define void @fdiv_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fdiv_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = fdiv <16 x half> %op1, %op2
@@ -340,13 +344,28 @@ define void @fdiv_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fdiv_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fdiv_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fdiv_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fdiv z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    fdiv z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fdiv_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = fdiv <32 x half> %op1, %op2
@@ -354,14 +373,15 @@ define void @fdiv_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fdiv_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fdiv_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fdiv_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = fdiv <64 x half> %op1, %op2
@@ -369,14 +389,15 @@ define void @fdiv_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fdiv_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fdiv_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fdiv_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = fdiv <128 x half> %op1, %op2
@@ -385,31 +406,34 @@ define void @fdiv_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v2f32:
-; CHECK: fdiv v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdiv v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = fdiv <2 x float> %op1, %op2
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v4f32:
-; CHECK: fdiv v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = fdiv <4 x float> %op1, %op2
   ret <4 x float> %res
 }
 
-define void @fdiv_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fdiv_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = fdiv <8 x float> %op1, %op2
@@ -418,13 +442,28 @@ define void @fdiv_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fdiv_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fdiv_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fdiv_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fdiv z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    fdiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fdiv_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = fdiv <16 x float> %op1, %op2
@@ -432,14 +471,15 @@ define void @fdiv_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fdiv_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fdiv_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fdiv_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = fdiv <32 x float> %op1, %op2
@@ -447,14 +487,15 @@ define void @fdiv_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fdiv_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fdiv_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fdiv_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = fdiv <64 x float> %op1, %op2
@@ -463,31 +504,34 @@ define void @fdiv_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v1f64:
-; CHECK: fdiv d0, d0, d1
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdiv d0, d0, d1
+; CHECK-NEXT:    ret
   %res = fdiv <1 x double> %op1, %op2
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v2f64:
-; CHECK: fdiv v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdiv v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = fdiv <2 x double> %op1, %op2
   ret <2 x double> %res
 }
 
-define void @fdiv_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fdiv_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fdiv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = fdiv <4 x double> %op1, %op2
@@ -496,13 +540,28 @@ define void @fdiv_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fdiv_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fdiv_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fdiv_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fdiv z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    fdiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fdiv_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = fdiv <8 x double> %op1, %op2
@@ -510,14 +569,15 @@ define void @fdiv_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fdiv_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fdiv_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fdiv_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = fdiv <16 x double> %op1, %op2
@@ -525,14 +585,15 @@ define void @fdiv_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fdiv_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fdiv_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fdiv_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = fdiv <32 x double> %op1, %op2
@@ -545,32 +606,37 @@ define void @fdiv_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) #0 {
+define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v4f16:
-; CHECK: fmla v2.4h, v1.4h, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla v2.4h, v1.4h, v0.4h
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) #0 {
+define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v8f16:
-; CHECK: fmla v2.8h, v1.8h, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
 }
 
-define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
+define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %op3 = load <16 x half>, <16 x half>* %c
@@ -580,14 +646,31 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
 }
 
 define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
-; CHECK-LABEL: fma_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fma_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
+; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z2.h, z4.h
+; VBITS_GE_256-NEXT:    fmad z1.h, p0/m, z3.h, z5.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fma_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x2]
+; VBITS_GE_512-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %op3 = load <32 x half>, <32 x half>* %c
@@ -596,15 +679,16 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
   ret void
 }
 
-define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
+define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fma_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %op3 = load <64 x half>, <64 x half>* %c
@@ -613,15 +697,16 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
   ret void
 }
 
-define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #0 {
+define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fma_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
-; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %op3 = load <128 x half>, <128 x half>* %c
@@ -631,32 +716,37 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) #0 {
+define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v2f32:
-; CHECK: fmla v2.2s, v1.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla v2.2s, v1.2s, v0.2s
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) #0 {
+define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v4f32:
-; CHECK: fmla v2.4s, v1.4s, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
   ret <4 x float> %res
 }
 
-define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
+define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %op3 = load <8 x float>, <8 x float>* %c
@@ -666,14 +756,31 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
 }
 
 define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 {
-; CHECK-LABEL: fma_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fma_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
+; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z2.s, z4.s
+; VBITS_GE_256-NEXT:    fmad z1.s, p0/m, z3.s, z5.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fma_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x2]
+; VBITS_GE_512-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %op3 = load <16 x float>, <16 x float>* %c
@@ -682,15 +789,16 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
   ret void
 }
 
-define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 {
+define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fma_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %op3 = load <32 x float>, <32 x float>* %c
@@ -699,15 +807,16 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
   ret void
 }
 
-define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 {
+define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fma_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
-; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %op3 = load <64 x float>, <64 x float>* %c
@@ -717,32 +826,36 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) #0 {
+define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v1f64:
-; CHECK: fmadd d0, d0, d1, d2
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d0, d1, d2
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.fma.v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) #0 {
+define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v2f64:
-; CHECK: fmla v2.2d, v1.2d, v0.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla v2.2d, v1.2d, v0.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
   ret <2 x double> %res
 }
 
-define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 {
+define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %op3 = load <4 x double>, <4 x double>* %c
@@ -752,14 +865,31 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
 }
 
 define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 {
-; CHECK-LABEL: fma_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fma_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
+; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z2.d, z4.d
+; VBITS_GE_256-NEXT:    fmad z1.d, p0/m, z3.d, z5.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fma_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; VBITS_GE_512-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %op3 = load <8 x double>, <8 x double>* %c
@@ -768,15 +898,16 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
   ret void
 }
 
-define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) #0 {
+define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fma_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %op3 = load <16 x double>, <16 x double>* %c
@@ -785,15 +916,16 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
   ret void
 }
 
-define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) #0 {
+define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fma_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
-; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %op3 = load <32 x double>, <32 x double>* %c
@@ -807,31 +939,34 @@ define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c)
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v4f16:
-; CHECK: fmul v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = fmul <4 x half> %op1, %op2
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v8f16:
-; CHECK: fmul v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = fmul <8 x half> %op1, %op2
   ret <8 x half> %res
 }
 
-define void @fmul_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmul_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = fmul <16 x half> %op1, %op2
@@ -840,13 +975,28 @@ define void @fmul_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fmul_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmul_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fmul_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fmul z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmul_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = fmul <32 x half> %op1, %op2
@@ -854,14 +1004,15 @@ define void @fmul_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fmul_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmul_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmul_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = fmul <64 x half> %op1, %op2
@@ -869,14 +1020,15 @@ define void @fmul_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fmul_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmul_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmul_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = fmul <128 x half> %op1, %op2
@@ -885,31 +1037,34 @@ define void @fmul_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v2f32:
-; CHECK: fmul v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = fmul <2 x float> %op1, %op2
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v4f32:
-; CHECK: fmul v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = fmul <4 x float> %op1, %op2
   ret <4 x float> %res
 }
 
-define void @fmul_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmul_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = fmul <8 x float> %op1, %op2
@@ -918,13 +1073,28 @@ define void @fmul_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fmul_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmul_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fmul_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fmul z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmul_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = fmul <16 x float> %op1, %op2
@@ -932,14 +1102,15 @@ define void @fmul_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fmul_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmul_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmul_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = fmul <32 x float> %op1, %op2
@@ -947,14 +1118,15 @@ define void @fmul_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fmul_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmul_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmul_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = fmul <64 x float> %op1, %op2
@@ -963,31 +1135,34 @@ define void @fmul_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v1f64:
-; CHECK: fmul d0, d0, d1
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
   %res = fmul <1 x double> %op1, %op2
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v2f64:
-; CHECK: fmul v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = fmul <2 x double> %op1, %op2
   ret <2 x double> %res
 }
 
-define void @fmul_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmul_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmul_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = fmul <4 x double> %op1, %op2
@@ -996,13 +1171,28 @@ define void @fmul_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fmul_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmul_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fmul_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fmul z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmul_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = fmul <8 x double> %op1, %op2
@@ -1010,14 +1200,15 @@ define void @fmul_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fmul_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmul_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmul_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = fmul <16 x double> %op1, %op2
@@ -1025,14 +1216,15 @@ define void @fmul_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fmul_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmul_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmul_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fmul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = fmul <32 x double> %op1, %op2
@@ -1045,30 +1237,33 @@ define void @fmul_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fneg_v4f16(<4 x half> %op) #0 {
+define <4 x half> @fneg_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v4f16:
-; CHECK: fneg v0.4h, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = fneg <4 x half> %op
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fneg_v8f16(<8 x half> %op) #0 {
+define <8 x half> @fneg_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v8f16:
-; CHECK: fneg v0.8h, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = fneg <8 x half> %op
   ret <8 x half> %res
 }
 
-define void @fneg_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fneg_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = fneg <16 x half> %op
   store <16 x half> %res, <16 x half>* %a
@@ -1076,38 +1271,53 @@ define void @fneg_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fneg_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fneg_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fneg_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fneg z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    fneg z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fneg_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fneg z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = fneg <32 x half> %op
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @fneg_v64f16(<64 x half>* %a) #0 {
+define void @fneg_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fneg_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = fneg <64 x half> %op
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @fneg_v128f16(<128 x half>* %a) #0 {
+define void @fneg_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fneg_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = fneg <128 x half> %op
   store <128 x half> %res, <128 x half>* %a
@@ -1115,30 +1325,33 @@ define void @fneg_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fneg_v2f32(<2 x float> %op) #0 {
+define <2 x float> @fneg_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v2f32:
-; CHECK: fneg v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = fneg <2 x float> %op
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fneg_v4f32(<4 x float> %op) #0 {
+define <4 x float> @fneg_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v4f32:
-; CHECK: fneg v0.4s, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = fneg <4 x float> %op
   ret <4 x float> %res
 }
 
-define void @fneg_v8f32(<8 x float>* %a) #0 {
+define void @fneg_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = fneg <8 x float> %op
   store <8 x float> %res, <8 x float>* %a
@@ -1146,38 +1359,53 @@ define void @fneg_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @fneg_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fneg_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fneg_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fneg z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fneg z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fneg_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fneg z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = fneg <16 x float> %op
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @fneg_v32f32(<32 x float>* %a) #0 {
+define void @fneg_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fneg_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = fneg <32 x float> %op
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @fneg_v64f32(<64 x float>* %a) #0 {
+define void @fneg_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fneg_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = fneg <64 x float> %op
   store <64 x float> %res, <64 x float>* %a
@@ -1185,30 +1413,33 @@ define void @fneg_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fneg_v1f64(<1 x double> %op) #0 {
+define <1 x double> @fneg_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v1f64:
-; CHECK: fneg d0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg d0, d0
+; CHECK-NEXT:    ret
   %res = fneg <1 x double> %op
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fneg_v2f64(<2 x double> %op) #0 {
+define <2 x double> @fneg_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v2f64:
-; CHECK: fneg v0.2d, v0.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = fneg <2 x double> %op
   ret <2 x double> %res
 }
 
-define void @fneg_v4f64(<4 x double>* %a) #0 {
+define void @fneg_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fneg_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = fneg <4 x double> %op
   store <4 x double> %res, <4 x double>* %a
@@ -1216,38 +1447,53 @@ define void @fneg_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @fneg_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fneg_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fneg_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fneg z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fneg z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fneg_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fneg z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = fneg <8 x double> %op
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @fneg_v16f64(<16 x double>* %a) #0 {
+define void @fneg_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fneg_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = fneg <16 x double> %op
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @fneg_v32f64(<32 x double>* %a) #0 {
+define void @fneg_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fneg_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fneg [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = fneg <32 x double> %op
   store <32 x double> %res, <32 x double>* %a
@@ -1259,30 +1505,33 @@ define void @fneg_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fsqrt_v4f16(<4 x half> %op) #0 {
+define <4 x half> @fsqrt_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v4f16:
-; CHECK: fsqrt v0.4h, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsqrt v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fsqrt_v8f16(<8 x half> %op) #0 {
+define <8 x half> @fsqrt_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v8f16:
-; CHECK: fsqrt v0.8h, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsqrt v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @fsqrt_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fsqrt_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -1290,38 +1539,53 @@ define void @fsqrt_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fsqrt_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fsqrt_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsqrt_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fsqrt z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    fsqrt z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fsqrt_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fsqrt z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @fsqrt_v64f16(<64 x half>* %a) #0 {
+define void @fsqrt_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fsqrt_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.sqrt.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @fsqrt_v128f16(<128 x half>* %a) #0 {
+define void @fsqrt_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fsqrt_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.sqrt.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -1329,30 +1593,33 @@ define void @fsqrt_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fsqrt_v2f32(<2 x float> %op) #0 {
+define <2 x float> @fsqrt_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v2f32:
-; CHECK: fsqrt v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsqrt v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fsqrt_v4f32(<4 x float> %op) #0 {
+define <4 x float> @fsqrt_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v4f32:
-; CHECK: fsqrt v0.4s, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsqrt v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @fsqrt_v8f32(<8 x float>* %a) #0 {
+define void @fsqrt_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -1360,38 +1627,53 @@ define void @fsqrt_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @fsqrt_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fsqrt_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsqrt_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fsqrt z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fsqrt z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fsqrt_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fsqrt z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @fsqrt_v32f32(<32 x float>* %a) #0 {
+define void @fsqrt_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fsqrt_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.sqrt.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @fsqrt_v64f32(<64 x float>* %a) #0 {
+define void @fsqrt_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fsqrt_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.sqrt.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -1399,30 +1681,33 @@ define void @fsqrt_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fsqrt_v1f64(<1 x double> %op) #0 {
+define <1 x double> @fsqrt_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v1f64:
-; CHECK: fsqrt d0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsqrt d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fsqrt_v2f64(<2 x double> %op) #0 {
+define <2 x double> @fsqrt_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v2f64:
-; CHECK: fsqrt v0.2d, v0.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsqrt v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @fsqrt_v4f64(<4 x double>* %a) #0 {
+define void @fsqrt_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsqrt_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -1430,38 +1715,53 @@ define void @fsqrt_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @fsqrt_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fsqrt_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsqrt_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fsqrt z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fsqrt z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fsqrt_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fsqrt z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @fsqrt_v16f64(<16 x double>* %a) #0 {
+define void @fsqrt_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fsqrt_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.sqrt.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @fsqrt_v32f64(<32 x double>* %a) #0 {
+define void @fsqrt_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fsqrt_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fsqrt [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.sqrt.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -1473,31 +1773,34 @@ define void @fsqrt_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v4f16:
-; CHECK: fsub v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = fsub <4 x half> %op1, %op2
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v8f16:
-; CHECK: fsub v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = fsub <8 x half> %op1, %op2
   ret <8 x half> %res
 }
 
-define void @fsub_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fsub_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = fsub <16 x half> %op1, %op2
@@ -1506,13 +1809,28 @@ define void @fsub_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fsub_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fsub_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsub_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fsub z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fsub_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = fsub <32 x half> %op1, %op2
@@ -1520,14 +1838,15 @@ define void @fsub_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fsub_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fsub_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fsub_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = fsub <64 x half> %op1, %op2
@@ -1535,14 +1854,15 @@ define void @fsub_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fsub_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fsub_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fsub_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = fsub <128 x half> %op1, %op2
@@ -1551,31 +1871,34 @@ define void @fsub_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v2f32:
-; CHECK: fsub v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = fsub <2 x float> %op1, %op2
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v4f32:
-; CHECK: fsub v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = fsub <4 x float> %op1, %op2
   ret <4 x float> %res
 }
 
-define void @fsub_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fsub_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = fsub <8 x float> %op1, %op2
@@ -1584,13 +1907,28 @@ define void @fsub_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fsub_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fsub_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsub_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fsub z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fsub_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = fsub <16 x float> %op1, %op2
@@ -1598,14 +1936,15 @@ define void @fsub_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fsub_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fsub_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fsub_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = fsub <32 x float> %op1, %op2
@@ -1613,14 +1952,15 @@ define void @fsub_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fsub_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fsub_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fsub_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = fsub <64 x float> %op1, %op2
@@ -1629,31 +1969,34 @@ define void @fsub_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v1f64:
-; CHECK: fsub d0, d0, d1
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub d0, d0, d1
+; CHECK-NEXT:    ret
   %res = fsub <1 x double> %op1, %op2
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v2f64:
-; CHECK: fsub v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = fsub <2 x double> %op1, %op2
   ret <2 x double> %res
 }
 
-define void @fsub_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fsub_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fsub_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = fsub <4 x double> %op1, %op2
@@ -1662,13 +2005,28 @@ define void @fsub_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fsub_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fsub_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fsub_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fsub z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fsub_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = fsub <8 x double> %op1, %op2
@@ -1676,14 +2034,15 @@ define void @fsub_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fsub_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fsub_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fsub_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = fsub <16 x double> %op1, %op2
@@ -1691,14 +2050,15 @@ define void @fsub_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fsub_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: fsub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = fsub <32 x double> %op1, %op2
@@ -1711,30 +2071,33 @@ define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fabs_v4f16(<4 x half> %op) #0 {
+define <4 x half> @fabs_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v4f16:
-; CHECK: fabs v0.4h, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fabs v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fabs_v8f16(<8 x half> %op) #0 {
+define <8 x half> @fabs_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v8f16:
-; CHECK: fabs v0.8h, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fabs v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @fabs_v16f16(<16 x half>* %a) #0 {
+define void @fabs_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -1742,38 +2105,53 @@ define void @fabs_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @fabs_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fabs_v32f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fabs_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fabs z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    fabs z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fabs_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fabs z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @fabs_v64f16(<64 x half>* %a) #0 {
+define void @fabs_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fabs_v64f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @fabs_v128f16(<128 x half>* %a) #0 {
+define void @fabs_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fabs_v128f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.fabs.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -1781,30 +2159,33 @@ define void @fabs_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fabs_v2f32(<2 x float> %op) #0 {
+define <2 x float> @fabs_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v2f32:
-; CHECK: fabs v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fabs v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fabs_v4f32(<4 x float> %op) #0 {
+define <4 x float> @fabs_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v4f32:
-; CHECK: fabs v0.4s, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fabs v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @fabs_v8f32(<8 x float>* %a) #0 {
+define void @fabs_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -1812,38 +2193,53 @@ define void @fabs_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @fabs_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fabs_v16f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fabs_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fabs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fabs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fabs_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fabs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @fabs_v32f32(<32 x float>* %a) #0 {
+define void @fabs_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fabs_v32f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @fabs_v64f32(<64 x float>* %a) #0 {
+define void @fabs_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fabs_v64f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -1851,30 +2247,33 @@ define void @fabs_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fabs_v1f64(<1 x double> %op) #0 {
+define <1 x double> @fabs_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v1f64:
-; CHECK: fabs d0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fabs d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fabs_v2f64(<2 x double> %op) #0 {
+define <2 x double> @fabs_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v2f64:
-; CHECK: fabs v0.2d, v0.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fabs v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @fabs_v4f64(<4 x double>* %a) #0 {
+define void @fabs_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fabs_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -1882,38 +2281,53 @@ define void @fabs_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @fabs_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fabs_v8f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: fabs_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fabs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fabs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fabs_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fabs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @fabs_v16f64(<16 x double>* %a) #0 {
+define void @fabs_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fabs_v16f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @fabs_v32f64(<32 x double>* %a) #0 {
+define void @fabs_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fabs_v32f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
similarity index 57%
rename from llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
rename to llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
index 2180a405b53c..db7bef039f66 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep 'z[0-9]'
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmeq v0.4h, v0.4h, v1.4h
@@ -35,7 +21,7 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmeq v0.8h, v0.8h, v1.8h
@@ -45,7 +31,7 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
   ret <8 x i16> %sext
 }
 
-define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -64,7 +50,6 @@ define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 }
 
 define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 {
-; Ensure sensible type legalisation
 ; VBITS_GE_256-LABEL: fcmp_oeq_v32f16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #16
@@ -98,44 +83,16 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #
   ret void
 }
 
-define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z2.h, z5.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z1.h, z4.h
-; VBITS_GE_256-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z0.h, z6.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z3.h, z7.h
-; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x2, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x2]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %cmp = fcmp oeq <64 x half> %op1, %op2
@@ -144,68 +101,16 @@ define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #
   ret void
 }
 
-define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #96
-; VBITS_GE_256-NEXT:    mov x9, #112
-; VBITS_GE_256-NEXT:    mov x10, #64
-; VBITS_GE_256-NEXT:    mov x11, #80
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #48
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z20.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z21.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z19.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z22.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z18.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z23.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z6.h, z17.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z5.h, z16.h
-; VBITS_GE_256-NEXT:    mov z5.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z6.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z4.h, z21.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z3.h, z20.h
-; VBITS_GE_256-NEXT:    mov z3.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z4.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z2.h, z22.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z1.h, z19.h
-; VBITS_GE_256-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z0.h, z18.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z7.h, z23.h
-; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z7.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x2, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x2, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x2, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z6.h }, p0, [x2, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x2, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x2]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %cmp = fcmp oeq <128 x half> %op1, %op2
@@ -215,7 +120,7 @@ define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
@@ -226,7 +131,7 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
@@ -236,7 +141,7 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
   ret <4 x i32> %sext
 }
 
-define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 {
+define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -255,7 +160,6 @@ define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0
 }
 
 define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 {
-; Ensure sensible type legalisation
 ; VBITS_GE_256-LABEL: fcmp_oeq_v16f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #8
@@ -289,44 +193,16 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c)
   ret void
 }
 
-define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT:    mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x2, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x2]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %cmp = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +211,16 @@ define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c)
   ret void
 }
 
-define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #24
-; VBITS_GE_256-NEXT:    mov x14, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z20.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z21.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z19.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z22.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z18.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT:    mov z5.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z6.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z4.s, z21.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, z20.s
-; VBITS_GE_256-NEXT:    mov z3.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z4.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z2.s, z22.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z1.s, z19.s
-; VBITS_GE_256-NEXT:    mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z18.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z7.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x2, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x2, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x2, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x2, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x2, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x2]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %cmp = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +230,7 @@ define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c)
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmeq d0, d0, d1
@@ -417,7 +241,7 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmeq v0.2d, v0.2d, v1.2d
@@ -427,7 +251,7 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
   ret <2 x i64> %sext
 }
 
-define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 {
+define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oeq_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -446,7 +270,6 @@ define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #
 }
 
 define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 {
-; Ensure sensible type legalisation
 ; VBITS_GE_256-LABEL: fcmp_oeq_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #4
@@ -480,44 +303,16 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #
   ret void
 }
 
-define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z2.d, z5.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, z4.d
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z2.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z6.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z3.d, z7.d
-; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x2, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x2]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %cmp = fcmp oeq <16 x double> %op1, %op2
@@ -526,68 +321,16 @@ define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %
   ret void
 }
 
-define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 {
-; VBITS_GE_256-LABEL: fcmp_oeq_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov x14, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z21.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z19.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z6.d, z17.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z5.d, z16.d
-; VBITS_GE_256-NEXT:    mov z5.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z6.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z4.d, z21.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z3.d, z20.d
-; VBITS_GE_256-NEXT:    mov z3.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z4.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z2.d, z22.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, z19.d
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z2.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z18.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z7.d, z23.d
-; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z7.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x2, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x2, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x2, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x2, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x2, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x2]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcmp_oeq_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %cmp = fcmp oeq <32 x double> %op1, %op2
@@ -600,7 +343,7 @@ define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %
 ; FCMP UEQ
 ;
 
-define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ueq_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -624,7 +367,7 @@ define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP ONE
 ;
 
-define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_one_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -648,7 +391,7 @@ define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP UNE
 ;
 
-define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -670,7 +413,7 @@ define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP OGT
 ;
 
-define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ogt_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -692,7 +435,7 @@ define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP UGT
 ;
 
-define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ugt_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -716,7 +459,7 @@ define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP OLT
 ;
 
-define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_olt_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -738,7 +481,7 @@ define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP ULT
 ;
 
-define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ult_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -762,7 +505,7 @@ define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP OGE
 ;
 
-define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_oge_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -784,7 +527,7 @@ define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP UGE
 ;
 
-define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_uge_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -808,7 +551,7 @@ define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP OLE
 ;
 
-define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ole_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -830,7 +573,7 @@ define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP ULE
 ;
 
-define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ule_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -854,7 +597,7 @@ define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP UNO
 ;
 
-define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_uno_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -876,7 +619,7 @@ define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP ORD
 ;
 
-define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ord_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -900,7 +643,7 @@ define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; FCMP EQ
 ;
 
-define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_eq_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -922,7 +665,7 @@ define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
 ; FCMP NE
 ;
 
-define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ne_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -944,7 +687,7 @@ define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
 ; FCMP GT
 ;
 
-define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_gt_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -966,7 +709,7 @@ define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
 ; FCMP LT
 ;
 
-define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_lt_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -988,7 +731,7 @@ define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
 ; FCMP GE
 ;
 
-define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_ge_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -1010,7 +753,7 @@ define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
 ; FCMP LE
 ;
 
-define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
+define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcmp_le_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
index 0ff987e2c461..67991510d310 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
+define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f16_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
@@ -38,7 +24,7 @@ define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
+define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v4f16_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -51,7 +37,7 @@ define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
+define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v8f16_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -66,7 +52,6 @@ define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #8
@@ -86,91 +71,34 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
 ; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fpext <16 x half> %op1 to <16 x float>
   store <16 x float> %res, <16 x float>* %b
   ret void
 }
 
-define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f16_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.h
-; VBITS_GE_256-NEXT:    fcvt z2.s, p0/m, z2.h
-; VBITS_GE_256-NEXT:    fcvt z3.s, p0/m, z3.h
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fcvt z0.s, p0/m, z0.h
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v32f16_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fpext <32 x half> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
   ret void
 }
 
-define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v64f16_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #56
-; VBITS_GE_256-NEXT:    mov x14, #48
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.s }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.s }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.s }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.s }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.h
-; VBITS_GE_256-NEXT:    fcvt z2.s, p0/m, z2.h
-; VBITS_GE_256-NEXT:    fcvt z3.s, p0/m, z3.h
-; VBITS_GE_256-NEXT:    fcvt z4.s, p0/m, z4.h
-; VBITS_GE_256-NEXT:    fcvt z5.s, p0/m, z5.h
-; VBITS_GE_256-NEXT:    fcvt z6.s, p0/m, z6.h
-; VBITS_GE_256-NEXT:    fcvt z7.s, p0/m, z7.h
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fcvt z0.s, p0/m, z0.h
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v64f16_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %res = fpext <64 x half> %op1 to <64 x float>
   store <64 x float> %res, <64 x float>* %b
@@ -182,7 +110,7 @@ define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
+define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v1f16_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr h0, [x0]
@@ -196,7 +124,7 @@ define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
+define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f16_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
@@ -212,7 +140,7 @@ define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
+define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v4f16_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -246,91 +174,34 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
 ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fpext <8 x half> %op1 to <8 x double>
   store <8 x double> %res, <8 x double>* %b
   ret void
 }
 
-define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v16f16_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.h
-; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.h
-; VBITS_GE_256-NEXT:    fcvt z2.d, p0/m, z2.h
-; VBITS_GE_256-NEXT:    fcvt z3.d, p0/m, z3.h
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fcvt z0.d, p0/m, z0.h
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f16_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fpext <16 x half> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f16_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.d }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.d }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.d }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.d }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.h
-; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.h
-; VBITS_GE_256-NEXT:    fcvt z2.d, p0/m, z2.h
-; VBITS_GE_256-NEXT:    fcvt z3.d, p0/m, z3.h
-; VBITS_GE_256-NEXT:    fcvt z4.d, p0/m, z4.h
-; VBITS_GE_256-NEXT:    fcvt z5.d, p0/m, z5.h
-; VBITS_GE_256-NEXT:    fcvt z6.d, p0/m, z6.h
-; VBITS_GE_256-NEXT:    fcvt z7.d, p0/m, z7.h
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fcvt z0.d, p0/m, z0.h
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f16_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fpext <32 x half> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -342,7 +213,7 @@ define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
+define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v1f32_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
@@ -356,7 +227,7 @@ define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
+define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f32_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -369,7 +240,7 @@ define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
+define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v4f32_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -384,7 +255,6 @@ define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #4
@@ -410,84 +280,28 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v16f32_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.s
-; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.s
-; VBITS_GE_256-NEXT:    fcvt z2.d, p0/m, z2.s
-; VBITS_GE_256-NEXT:    fcvt z3.d, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fcvt z0.d, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f32_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fpext <16 x float> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f32_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.d }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.d }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.s
-; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.s
-; VBITS_GE_256-NEXT:    fcvt z2.d, p0/m, z2.s
-; VBITS_GE_256-NEXT:    fcvt z3.d, p0/m, z3.s
-; VBITS_GE_256-NEXT:    fcvt z4.d, p0/m, z4.s
-; VBITS_GE_256-NEXT:    fcvt z5.d, p0/m, z5.s
-; VBITS_GE_256-NEXT:    fcvt z6.d, p0/m, z6.s
-; VBITS_GE_256-NEXT:    fcvt z7.d, p0/m, z7.s
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fcvt z0.d, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f32_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fpext <32 x float> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -499,7 +313,7 @@ define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
+define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f32_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -513,7 +327,7 @@ define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
+define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v4f32_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -526,7 +340,7 @@ define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
+define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v8f32_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -541,7 +355,18 @@ define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
 }
 
 define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
-; Ensure sensible type legalisation
+; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -555,90 +380,28 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f32_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.s
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_256-NEXT:    fcvt z2.h, p0/m, z2.s
-; VBITS_GE_256-NEXT:    fcvt z3.h, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.s }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v32f32_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptrunc <32 x float> %op1 to <32 x half>
   store <32 x half> %res, <32 x half>* %b
   ret void
 }
 
-define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v64f32_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    mov x10, #48
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    mov x13, #40
-; VBITS_GE_256-NEXT:    mov x14, #32
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.s
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    movprfx z0, z5
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z5.s
-; VBITS_GE_256-NEXT:    movprfx z1, z4
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z4.s
-; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    movprfx z0, z6
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z6.s
-; VBITS_GE_256-NEXT:    movprfx z1, z3
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    movprfx z0, z2
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z2.s
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z7.s
-; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v64f32_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptrunc <64 x float> %op1 to <64 x half>
   store <64 x half> %res, <64 x half>* %b
@@ -650,7 +413,7 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
+define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v1f64_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -664,7 +427,7 @@ define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
+define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f64_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -680,7 +443,7 @@ define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
+define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v4f64_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -695,7 +458,6 @@ define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
 }
 
 define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
-; Ensure sensible type legalisation
 ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #4
@@ -726,70 +488,28 @@ define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
-; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1h { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f64_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
+; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptrunc <16 x double> %op1 to <16 x half>
   store <16 x half> %res, <16 x half>* %b
   ret void
 }
 
-define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f64_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x11, #12
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x13, #20
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    movprfx z0, z5
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z5.d
-; VBITS_GE_256-NEXT:    movprfx z1, z4
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z4.d
-; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    movprfx z0, z6
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z6.d
-; VBITS_GE_256-NEXT:    movprfx z1, z3
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    movprfx z0, z2
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z2.d
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z7.d
-; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f64_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
+; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptrunc <32 x double> %op1 to <32 x half>
   store <32 x half> %res, <32 x half>* %b
@@ -801,7 +521,7 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
+define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v1f64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -814,7 +534,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
+define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f64_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
@@ -825,7 +545,7 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
+define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v4f64_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -840,7 +560,18 @@ define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
 }
 
 define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
-; Ensure sensible type legalisation
+; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -854,90 +585,28 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
   ret void
 }
 
-define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v16f64_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.d
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_256-NEXT:    fcvt z2.s, p0/m, z2.d
-; VBITS_GE_256-NEXT:    fcvt z3.s, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: fcvt_v16f64_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptrunc <16 x double> %op1 to <16 x float>
   store <16 x float> %res, <16 x float>* %b
   ret void
 }
 
-define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: fcvt_v32f64_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x11, #12
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x13, #20
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.d
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    movprfx z0, z5
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z5.d
-; VBITS_GE_256-NEXT:    movprfx z1, z4
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z4.d
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    movprfx z0, z6
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z6.d
-; VBITS_GE_256-NEXT:    movprfx z1, z3
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    movprfx z0, z2
-; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z2.d
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z7.d
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: fcvt_v32f64_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptrunc <32 x double> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
index b22663661796..2d0f8da1efaa 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s
+; RUN: llc -O3 -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -O3 -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -8,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) #0 {
+define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmla v2.4h, v0.4h, v1.4h
@@ -20,7 +22,7 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) #0 {
+define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmla v2.8h, v0.8h, v1.8h
@@ -31,7 +33,7 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
 }
 
-define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
+define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -51,15 +53,31 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
 }
 
 define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
-; CHECK-LABEL: fma_v32f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl32
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
-; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    ret
+; VBITS_GE_256-LABEL: fma_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
+; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z2.h, z4.h
+; VBITS_GE_256-NEXT:    fmad z1.h, p0/m, z3.h, z5.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fma_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x2]
+; VBITS_GE_512-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %op3 = load <32 x half>, <32 x half>* %c
@@ -69,7 +87,7 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
   ret void
 }
 
-define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
+define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fma_v64f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
@@ -88,7 +106,7 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
   ret void
 }
 
-define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #0 {
+define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fma_v128f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
@@ -108,7 +126,7 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) #0 {
+define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
@@ -120,7 +138,7 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) #0 {
+define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmla v2.4s, v0.4s, v1.4s
@@ -131,7 +149,7 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
   ret <4 x float> %res
 }
 
-define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
+define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -151,15 +169,31 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
 }
 
 define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 {
-; CHECK-LABEL: fma_v16f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl16
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
-; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
-; CHECK-NEXT:    ret
+; VBITS_GE_256-LABEL: fma_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
+; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z2.s, z4.s
+; VBITS_GE_256-NEXT:    fmad z1.s, p0/m, z3.s, z5.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fma_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x2]
+; VBITS_GE_512-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %op3 = load <16 x float>, <16 x float>* %c
@@ -169,7 +203,7 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
   ret void
 }
 
-define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 {
+define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fma_v32f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
@@ -188,7 +222,7 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
   ret void
 }
 
-define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 {
+define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fma_v64f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
@@ -208,7 +242,7 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) #0 {
+define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmadd d0, d0, d1, d2
@@ -219,7 +253,7 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) #0 {
+define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmla v2.2d, v0.2d, v1.2d
@@ -230,7 +264,7 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
   ret <2 x double> %res
 }
 
-define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 {
+define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fma_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -250,15 +284,31 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
 }
 
 define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 {
-; CHECK-LABEL: fma_v8f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
-; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
-; CHECK-NEXT:    ret
+; VBITS_GE_256-LABEL: fma_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
+; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z2.d, z4.d
+; VBITS_GE_256-NEXT:    fmad z1.d, p0/m, z3.d, z5.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fma_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; VBITS_GE_512-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %op3 = load <8 x double>, <8 x double>* %c
@@ -268,7 +318,7 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
   ret void
 }
 
-define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) #0 {
+define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fma_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
@@ -287,7 +337,7 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
   ret void
 }
 
-define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) #0 {
+define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fma_v32f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
index 0ed49e4aaaff..955169c0a130 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -1,55 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; FMAXNM
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v4f16:
-; CHECK: fmaxnm v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v8f16:
-; CHECK: fmaxnm v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
 
-define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -58,26 +46,28 @@ define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fmaxnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmaxnm_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmaxnm_v32f16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    fmaxnm z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaxnm_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = call <32 x half> @llvm.maxnum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -85,14 +75,15 @@ define void @fmaxnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmaxnm_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = call <64 x half> @llvm.maxnum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -100,14 +91,15 @@ define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmaxnm_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = call <128 x half> @llvm.maxnum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -116,31 +108,34 @@ define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v2f32:
-; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v4f32:
-; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
 
-define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -149,26 +144,28 @@ define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fmaxnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmaxnm_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmaxnm_v16f32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #8
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT:    fmaxnm z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaxnm_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -176,14 +173,15 @@ define void @fmaxnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmaxnm_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = call <32 x float> @llvm.maxnum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -191,14 +189,15 @@ define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmaxnm_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = call <64 x float> @llvm.maxnum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -207,31 +206,34 @@ define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v1f64:
-; CHECK: fmaxnm d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnm d0, d0, d1
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v2f64:
-; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
 
-define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxnm_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -240,26 +242,28 @@ define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fmaxnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmaxnm_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmaxnm_v8f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #4
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT:    fmaxnm z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaxnm_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -267,14 +271,15 @@ define void @fmaxnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmaxnm_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -282,14 +287,15 @@ define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmaxnm_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = call <32 x double> @llvm.maxnum.v32f64(<32 x double> %op1, <32 x double> %op2)
@@ -302,31 +308,34 @@ define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v4f16:
-; CHECK: fminnm v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v8f16:
-; CHECK: fminnm v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
 
-define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -335,26 +344,28 @@ define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fminnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fminnm_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fminnm_v32f16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    fminnm z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminnm_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = call <32 x half> @llvm.minnum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -362,14 +373,15 @@ define void @fminnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fminnm_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = call <64 x half> @llvm.minnum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -377,14 +389,15 @@ define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fminnm_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = call <128 x half> @llvm.minnum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -393,31 +406,34 @@ define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v2f32:
-; CHECK: fminnm v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v4f32:
-; CHECK: fminnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
 
-define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -426,26 +442,28 @@ define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fminnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fminnm_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fminnm_v16f32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #8
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT:    fminnm z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminnm_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = call <16 x float> @llvm.minnum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -453,14 +471,15 @@ define void @fminnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fminnm_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = call <32 x float> @llvm.minnum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -468,14 +487,15 @@ define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fminnm_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = call <64 x float> @llvm.minnum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -484,31 +504,34 @@ define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v1f64:
-; CHECK: fminnm d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm d0, d0, d1
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v2f64:
-; CHECK: fminnm v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnm v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
 
-define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminnm_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -517,26 +540,28 @@ define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fminnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fminnm_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fminnm_v8f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #4
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fminnm z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT:    fminnm z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminnm_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = call <8 x double> @llvm.minnum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -544,14 +569,15 @@ define void @fminnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fminnm_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = call <16 x double> @llvm.minnum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -559,14 +585,15 @@ define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fminnm_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = call <32 x double> @llvm.minnum.v32f64(<32 x double> %op1, <32 x double> %op2)
@@ -579,31 +606,34 @@ define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v4f16:
-; CHECK: fmax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v8f16:
-; CHECK: fmax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
 
-define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -612,26 +642,28 @@ define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fmax_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmax_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmax_v32f16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmax z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    fmax z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmax_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = call <32 x half> @llvm.maximum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -639,14 +671,15 @@ define void @fmax_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmax_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = call <64 x half> @llvm.maximum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -654,14 +687,15 @@ define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmax_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = call <128 x half> @llvm.maximum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -670,31 +704,34 @@ define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v2f32:
-; CHECK: fmax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v4f32:
-; CHECK: fmax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
 
-define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -703,26 +740,28 @@ define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fmax_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmax_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmax_v16f32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #8
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmax z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT:    fmax z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmax_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = call <16 x float> @llvm.maximum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -730,14 +769,15 @@ define void @fmax_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmax_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = call <32 x float> @llvm.maximum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -745,14 +785,15 @@ define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmax_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = call <64 x float> @llvm.maximum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -761,31 +802,34 @@ define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v1f64:
-; CHECK: fmax d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmax d0, d0, d1
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v2f64:
-; CHECK: fmax v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmax v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
 
-define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmax_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -794,26 +838,28 @@ define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fmax_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmax_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmax_v8f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #4
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmax z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT:    fmax z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmax_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = call <8 x double> @llvm.maximum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -821,14 +867,15 @@ define void @fmax_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmax_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = call <16 x double> @llvm.maximum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -836,14 +883,15 @@ define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmax_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = call <32 x double> @llvm.maximum.v32f64(<32 x double> %op1, <32 x double> %op2)
@@ -856,31 +904,34 @@ define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v4f16:
-; CHECK: fmin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v8f16:
-; CHECK: fmin v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
 
-define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %op2 = load <16 x half>, <16 x half>* %b
   %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -889,26 +940,28 @@ define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @fmin_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; CHECK-LABEL: fmin_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmin_v32f16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmin z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    fmin z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmin_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %op2 = load <32 x half>, <32 x half>* %b
   %res = call <32 x half> @llvm.minimum.v32f16(<32 x half> %op1, <32 x half> %op2)
@@ -916,14 +969,15 @@ define void @fmin_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmin_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %res = call <64 x half> @llvm.minimum.v64f16(<64 x half> %op1, <64 x half> %op2)
@@ -931,14 +985,15 @@ define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmin_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %res = call <128 x half> @llvm.minimum.v128f16(<128 x half> %op1, <128 x half> %op2)
@@ -947,31 +1002,34 @@ define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v2f32:
-; CHECK: fmin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v4f32:
-; CHECK: fmin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
 
-define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %op2 = load <8 x float>, <8 x float>* %b
   %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -980,26 +1038,28 @@ define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @fmin_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; CHECK-LABEL: fmin_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmin_v16f32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #8
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmin z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT:    fmin z1.s, p0/m, z1.s, z3.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmin_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %op2 = load <16 x float>, <16 x float>* %b
   %res = call <16 x float> @llvm.minimum.v16f32(<16 x float> %op1, <16 x float> %op2)
@@ -1007,14 +1067,15 @@ define void @fmin_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmin_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %res = call <32 x float> @llvm.minimum.v32f32(<32 x float> %op1, <32 x float> %op2)
@@ -1022,14 +1083,15 @@ define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmin_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %res = call <64 x float> @llvm.minimum.v64f32(<64 x float> %op1, <64 x float> %op2)
@@ -1038,31 +1100,34 @@ define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
+define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v1f64:
-; CHECK: fmin d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmin d0, d0, d1
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v2f64:
-; CHECK: fmin v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmin v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
 
-define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmin_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
   %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -1071,26 +1136,28 @@ define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fmin_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; CHECK-LABEL: fmin_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_EQ_256-LABEL: fmin_v8f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #4
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    fmin z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_256-NEXT:    fmin z1.d, p0/m, z1.d, z3.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmin_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %op2 = load <8 x double>, <8 x double>* %b
   %res = call <8 x double> @llvm.minimum.v8f64(<8 x double> %op1, <8 x double> %op2)
@@ -1098,14 +1165,15 @@ define void @fmin_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmin_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %res = call <16 x double> @llvm.minimum.v16f64(<16 x double> %op1, <16 x double> %op2)
@@ -1113,14 +1181,15 @@ define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @fmin_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+define void @fmin_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmin_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %res = call <32 x double> @llvm.minimum.v32f64(<32 x double> %op1, <32 x double> %op2)

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
index 55cc1ebc8114..fc6fb7c85dce 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
@@ -1,243 +1,297 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; FADDA
 ;
 
 ; No single instruction NEON support. Use SVE.
-define half @fadda_v4f16(half %start, <4 x half> %a) #0 {
+define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v4f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    fadda h0, p0, h0, z1.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
 
 ; No single instruction NEON support. Use SVE.
-define half @fadda_v8f16(half %start, <8 x half> %a) #0 {
+define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v8f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fadda h0, p0, h0, z1.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
 
-define half @fadda_v16f16(half %start, <16 x half>* %a) #0 {
+define half @fadda_v16f16(half %start, <16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    fadda h0, p0, h0, z1.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
 }
 
 define half @fadda_v32f16(half %start, <32 x half>* %a) #0 {
-; CHECK-LABEL: fadda_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h
-; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fadda_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fadda h0, p0, h0, z2.h
+; VBITS_GE_256-NEXT:    fadda h0, p0, h0, z1.h
+; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fadda_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 def $z0
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fadda h0, p0, h0, z1.h
+; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
   ret half %res
 }
 
-define half @fadda_v64f16(half %start, <64 x half>* %a) #0 {
+define half @fadda_v64f16(half %start, <64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fadda_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    fadda h0, p0, h0, z1.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
   ret half %res
 }
 
-define half @fadda_v128f16(half %start, <128 x half>* %a) #0 {
+define half @fadda_v128f16(half %start, <128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fadda_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fadda h0, [[PG]], h0, [[OP]].h
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    fadda h0, p0, h0, z1.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
   ret half %res
 }
 
 ; No single instruction NEON support. Use SVE.
-define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
+define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v2f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    fadda s0, p0, s0, z1.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
 
 ; No single instruction NEON support. Use SVE.
-define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
+define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v4f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fadda s0, p0, s0, z1.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
 
-define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
+define float @fadda_v8f32(float %start, <8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    fadda s0, p0, s0, z1.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
 }
 
 define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
-; CHECK-LABEL: fadda_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s
-; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fadda_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fadda s0, p0, s0, z2.s
+; VBITS_GE_256-NEXT:    fadda s0, p0, s0, z1.s
+; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fadda_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 def $z0
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fadda s0, p0, s0, z1.s
+; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
   ret float %res
 }
 
-define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
+define float @fadda_v32f32(float %start, <32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fadda_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    fadda s0, p0, s0, z1.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
   ret float %res
 }
 
-define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
+define float @fadda_v64f32(float %start, <64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fadda_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s0, [[OP]].s
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    fadda s0, p0, s0, z1.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
   ret float %res
 }
 
 ; No single instruction NEON support. Use SVE.
-define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
+define double @fadda_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v1f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    fadda d0, p0, d0, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
 
 ; No single instruction NEON support. Use SVE.
-define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
+define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v2f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fadda d0, p0, d0, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
 
-define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
+define double @fadda_v4f64(double %start, <4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fadda_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    fadda d0, p0, d0, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
 }
 
 define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
-; CHECK-LABEL: fadda_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d
-; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fadda_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fadda d0, p0, d0, z2.d
+; VBITS_GE_256-NEXT:    fadda d0, p0, d0, z1.d
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fadda_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fadda d0, p0, d0, z1.d
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
   ret double %res
 }
 
-define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
+define double @fadda_v16f64(double %start, <16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fadda_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    fadda d0, p0, d0, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
   ret double %res
 }
 
-define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
+define double @fadda_v32f64(double %start, <32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fadda_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d0, [[OP]].d
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    fadda d0, p0, d0, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
   ret double %res
@@ -248,236 +302,260 @@ define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
 ;
 
 ; No single instruction NEON support for 4 element vectors.
-define half @faddv_v4f16(half %start, <4 x half> %a) #0 {
+define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v4f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
-; CHECK-NEXT: fadd h0, h0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    faddv h1, p0, z1.h
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
 
 ; No single instruction NEON support for 8 element vectors.
-define half @faddv_v8f16(half %start, <8 x half> %a) #0 {
+define half @faddv_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v8f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
-; CHECK-NEXT: fadd h0, h0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    faddv h1, p0, z1.h
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
 
-define half @faddv_v16f16(half %start, <16 x half>* %a) #0 {
+define half @faddv_v16f16(half %start, <16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fadd h0, h0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    faddv h1, p0, z1.h
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
 }
 
 define half @faddv_v32f16(half %start, <32 x half>* %a) #0 {
-; CHECK-LABEL: faddv_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fadd h0, h0, [[RDX]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: faddv h1, [[PG]], [[ADD]].h
-; VBITS_EQ_256-DAG: fadd h0, h0, [[RDX]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: faddv_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
+; VBITS_GE_256-NEXT:    faddv h1, p0, z1.h
+; VBITS_GE_256-NEXT:    fadd h0, h0, h1
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: faddv_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    faddv h1, p0, z1.h
+; VBITS_GE_512-NEXT:    fadd h0, h0, h1
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
   ret half %res
 }
 
-define half @faddv_v64f16(half %start, <64 x half>* %a) #0 {
+define half @faddv_v64f16(half %start, <64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: faddv_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fadd h0, h0, [[RDX]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    faddv h1, p0, z1.h
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
   ret half %res
 }
 
-define half @faddv_v128f16(half %start, <128 x half>* %a) #0 {
+define half @faddv_v128f16(half %start, <128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: faddv_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fadd h0, h0, [[RDX]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    faddv h1, p0, z1.h
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
   ret half %res
 }
 
 ; Don't use SVE for 2 element vectors.
-define float @faddv_v2f32(float %start, <2 x float> %a) #0 {
+define float @faddv_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v2f32:
-; CHECK: faddp s1, v1.2s
-; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    faddp s1, v1.2s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
 
 ; No single instruction NEON support for 4 element vectors.
-define float @faddv_v4f32(float %start, <4 x float> %a) #0 {
+define float @faddv_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v4f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], z1.s
-; CHECK-NEXT: fadd s0, s0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
 
-define float @faddv_v8f32(float %start, <8 x float>* %a) #0 {
+define float @faddv_v8f32(float %start, <8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fadd s0, s0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
 }
 
 define float @faddv_v16f32(float %start, <16 x float>* %a) #0 {
-; CHECK-LABEL: faddv_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fadd s0, s0, [[RDX]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s
-; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: faddv_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
+; VBITS_GE_256-NEXT:    faddv s1, p0, z1.s
+; VBITS_GE_256-NEXT:    fadd s0, s0, s1
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: faddv_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    faddv s1, p0, z1.s
+; VBITS_GE_512-NEXT:    fadd s0, s0, s1
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
   ret float %res
 }
 
-define float @faddv_v32f32(float %start, <32 x float>* %a) #0 {
+define float @faddv_v32f32(float %start, <32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: faddv_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fadd s0, s0, [[RDX]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
   ret float %res
 }
 
-define float @faddv_v64f32(float %start, <64 x float>* %a) #0 {
+define float @faddv_v64f32(float %start, <64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: faddv_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fadd s0, s0, [[RDX]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
   ret float %res
 }
 
 ; Don't use SVE for 1 element vectors.
-define double @faddv_v1f64(double %start, <1 x double> %a) #0 {
+define double @faddv_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v1f64:
-; CHECK: fadd d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
 
 ; Don't use SVE for 2 element vectors.
-define double @faddv_v2f64(double %start, <2 x double> %a) #0 {
+define double @faddv_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v2f64:
-; CHECK: faddp d1, v1.2d
-; CHECK-NEXT: fadd d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    faddp d1, v1.2d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
 
-define double @faddv_v4f64(double %start, <4 x double>* %a) #0 {
+define double @faddv_v4f64(double %start, <4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fadd d0, d0, [[RDX]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
 }
 
 define double @faddv_v8f64(double %start, <8 x double>* %a) #0 {
-; CHECK-LABEL: faddv_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fadd d0, d0, [[RDX]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d
-; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: faddv_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
+; VBITS_GE_256-NEXT:    faddv d1, p0, z1.d
+; VBITS_GE_256-NEXT:    fadd d0, d0, d1
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: faddv_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    faddv d1, p0, z1.d
+; VBITS_GE_512-NEXT:    fadd d0, d0, d1
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
   ret double %res
 }
 
-define double @faddv_v16f64(double %start, <16 x double>* %a) #0 {
+define double @faddv_v16f64(double %start, <16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: faddv_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fadd d0, d0, [[RDX]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
   ret double %res
 }
 
-define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
+define double @faddv_v32f64(double %start, <32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: faddv_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fadd d0, d0, [[RDX]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
   ret double %res
@@ -488,213 +566,248 @@ define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
 ;
 
 ; No NEON 16-bit vector FMAXNMV support. Use SVE.
-define half @fmaxv_v4f16(<4 x half> %a) #0 {
+define half @fmaxv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v4f16:
-; CHECK: fmaxnmv h0, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnmv h0, v0.4h
+; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
 }
 
 ; No NEON 16-bit vector FMAXNMV support. Use SVE.
-define half @fmaxv_v8f16(<8 x half> %a) #0 {
+define half @fmaxv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v8f16:
-; CHECK: fmaxnmv h0, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnmv h0, v0.8h
+; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
 }
 
-define half @fmaxv_v16f16(<16 x half>* %a) #0 {
+define half @fmaxv_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
   ret half %res
 }
 
 define half @fmaxv_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fmaxv_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: fmaxnmv h0, [[PG]], [[MAX]].h
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fmaxv_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    fmaxnmv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaxv_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fmaxnmv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
   ret half %res
 }
 
-define half @fmaxv_v64f16(<64 x half>* %a) #0 {
+define half @fmaxv_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmaxv_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
   ret half %res
 }
 
-define half @fmaxv_v128f16(<128 x half>* %a) #0 {
+define half @fmaxv_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmaxv_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
   ret half %res
 }
 
 ; Don't use SVE for 64-bit f32 vectors.
-define float @fmaxv_v2f32(<2 x float> %a) #0 {
+define float @fmaxv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v2f32:
-; CHECK: fmaxnmp s0, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnmp s0, v0.2s
+; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
 }
 
 ; Don't use SVE for 128-bit f32 vectors.
-define float @fmaxv_v4f32(<4 x float> %a) #0 {
+define float @fmaxv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v4f32:
-; CHECK: fmaxnmv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnmv s0, v0.4s
+; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
 }
 
-define float @fmaxv_v8f32(<8 x float>* %a) #0 {
+define float @fmaxv_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
   ret float %res
 }
 
 define float @fmaxv_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fmaxv_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: fmaxnmv s0, [[PG]], [[MAX]].s
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fmaxv_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    fmaxnmv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaxv_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fmaxnmv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
   ret float %res
 }
 
-define float @fmaxv_v32f32(<32 x float>* %a) #0 {
+define float @fmaxv_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmaxv_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
   ret float %res
 }
 
-define float @fmaxv_v64f32(<64 x float>* %a) #0 {
+define float @fmaxv_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmaxv_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
   ret float %res
 }
 
 ; Nothing to do for single element vectors.
-define double @fmaxv_v1f64(<1 x double> %a) #0 {
+define double @fmaxv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v1f64:
-; CHECK-NOT: fmax
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %res
 }
 
 ; Don't use SVE for 128-bit f64 vectors.
-define double @fmaxv_v2f64(<2 x double> %a) #0 {
+define double @fmaxv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v2f64:
-; CHECK: fmaxnmp d0, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmaxnmp d0, v0.2d
+; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
 }
 
-define double @fmaxv_v4f64(<4 x double>* %a) #0 {
+define double @fmaxv_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fmaxv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
   ret double %res
 }
 
 define double @fmaxv_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fmaxv_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: fmaxnmv d0, [[PG]], [[MAX]].d
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fmaxv_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    fmaxnmv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fmaxv_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fmaxnmv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
   ret double %res
 }
 
-define double @fmaxv_v16f64(<16 x double>* %a) #0 {
+define double @fmaxv_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fmaxv_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
   ret double %res
 }
 
-define double @fmaxv_v32f64(<32 x double>* %a) #0 {
+define double @fmaxv_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fmaxv_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
   ret double %res
@@ -705,213 +818,248 @@ define double @fmaxv_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; No NEON 16-bit vector FMINNMV support. Use SVE.
-define half @fminv_v4f16(<4 x half> %a) #0 {
+define half @fminv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v4f16:
-; CHECK: fminnmv h0, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnmv h0, v0.4h
+; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
 }
 
 ; No NEON 16-bit vector FMINNMV support. Use SVE.
-define half @fminv_v8f16(<8 x half> %a) #0 {
+define half @fminv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v8f16:
-; CHECK: fminnmv h0, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnmv h0, v0.8h
+; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
 }
 
-define half @fminv_v16f16(<16 x half>* %a) #0 {
+define half @fminv_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv h0, p0, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
   ret half %res
 }
 
 define half @fminv_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: fminv_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: fminnmv h0, [[PG]], [[MIN]].h
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fminv_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    fminnmv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminv_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fminnmv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
   ret half %res
 }
 
-define half @fminv_v64f16(<64 x half>* %a) #0 {
+define half @fminv_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fminv_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv h0, p0, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
   ret half %res
 }
 
-define half @fminv_v128f16(<128 x half>* %a) #0 {
+define half @fminv_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fminv_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv h0, p0, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
   ret half %res
 }
 
 ; Don't use SVE for 64-bit f32 vectors.
-define float @fminv_v2f32(<2 x float> %a) #0 {
+define float @fminv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v2f32:
-; CHECK: fminnmp s0, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnmp s0, v0.2s
+; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
 }
 
 ; Don't use SVE for 128-bit f32 vectors.
-define float @fminv_v4f32(<4 x float> %a) #0 {
+define float @fminv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v4f32:
-; CHECK: fminnmv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnmv s0, v0.4s
+; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
 }
 
-define float @fminv_v8f32(<8 x float>* %a) #0 {
+define float @fminv_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
   ret float %res
 }
 
 define float @fminv_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: fminv_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: fminnmv s0, [[PG]], [[MIN]].s
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fminv_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    fminnmv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminv_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fminnmv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
   ret float %res
 }
 
-define float @fminv_v32f32(<32 x float>* %a) #0 {
+define float @fminv_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fminv_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
   ret float %res
 }
 
-define float @fminv_v64f32(<64 x float>* %a) #0 {
+define float @fminv_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fminv_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
   ret float %res
 }
 
 ; Nothing to do for single element vectors.
-define double @fminv_v1f64(<1 x double> %a) #0 {
+define double @fminv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v1f64:
-; CHECK-NOT: fmin
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
   ret double %res
 }
 
 ; Don't use SVE for 128-bit f64 vectors.
-define double @fminv_v2f64(<2 x double> %a) #0 {
+define double @fminv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v2f64:
-; CHECK: fminnmp d0, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fminnmp d0, v0.2d
+; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
 }
 
-define double @fminv_v4f64(<4 x double>* %a) #0 {
+define double @fminv_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fminv_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv d0, p0, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
   ret double %res
 }
 
 define double @fminv_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: fminv_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: fminnmv d0, [[PG]], [[MIN]].d
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fminv_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    fminnmv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fminv_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fminnmv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
   ret double %res
 }
 
-define double @fminv_v16f64(<16 x double>* %a) #0 {
+define double @fminv_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fminv_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv d0, p0, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
   ret double %res
 }
 
-define double @fminv_v32f64(<32 x double>* %a) #0 {
+define double @fminv_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fminv_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fminnmv d0, p0, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
   ret double %res

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
index e9a0c3658a3d..0f5afa5b17b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
@@ -1,54 +1,42 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; CEIL -> FRINTP
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintp_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintp_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v4f16:
-; CHECK: frintp v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintp v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintp_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintp_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v8f16:
-; CHECK: frintp v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintp v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frintp_v16f16(<16 x half>* %a) #0 {
+define void @frintp_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -56,49 +44,53 @@ define void @frintp_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frintp_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintp_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintp_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintp z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frintp z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintp_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintp z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.ceil.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frintp_v64f16(<64 x half>* %a) #0 {
+define void @frintp_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintp_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.ceil.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frintp_v128f16(<128 x half>* %a) #0 {
+define void @frintp_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintp_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.ceil.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -106,30 +98,33 @@ define void @frintp_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintp_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintp_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v2f32:
-; CHECK: frintp v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintp v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintp_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintp_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v4f32:
-; CHECK: frintp v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintp v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frintp_v8f32(<8 x float>* %a) #0 {
+define void @frintp_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -137,49 +132,53 @@ define void @frintp_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frintp_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintp_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintp_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintp z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frintp z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintp_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintp z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frintp_v32f32(<32 x float>* %a) #0 {
+define void @frintp_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintp_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.ceil.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frintp_v64f32(<64 x float>* %a) #0 {
+define void @frintp_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintp_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.ceil.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -187,30 +186,33 @@ define void @frintp_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintp_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintp_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v1f64:
-; CHECK: frintp d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintp d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintp_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintp_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v2f64:
-; CHECK: frintp v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintp v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frintp_v4f64(<4 x double>* %a) #0 {
+define void @frintp_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintp_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -218,49 +220,53 @@ define void @frintp_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frintp_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintp_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintp_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintp z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frintp z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintp_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintp z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frintp_v16f64(<16 x double>* %a) #0 {
+define void @frintp_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintp_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.ceil.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frintp_v32f64(<32 x double>* %a) #0 {
+define void @frintp_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintp_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.ceil.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -272,30 +278,33 @@ define void @frintp_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintm_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintm_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v4f16:
-; CHECK: frintm v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintm v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintm_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintm_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v8f16:
-; CHECK: frintm v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintm v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frintm_v16f16(<16 x half>* %a) #0 {
+define void @frintm_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -303,49 +312,53 @@ define void @frintm_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frintm_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintm_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintm_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintm z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frintm z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintm_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintm z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.floor.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frintm_v64f16(<64 x half>* %a) #0 {
+define void @frintm_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintm_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.floor.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frintm_v128f16(<128 x half>* %a) #0 {
+define void @frintm_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintm_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.floor.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -353,30 +366,33 @@ define void @frintm_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintm_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintm_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v2f32:
-; CHECK: frintm v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintm v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintm_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintm_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v4f32:
-; CHECK: frintm v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintm v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frintm_v8f32(<8 x float>* %a) #0 {
+define void @frintm_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -384,49 +400,53 @@ define void @frintm_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frintm_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintm_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintm_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintm z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frintm z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintm_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintm z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frintm_v32f32(<32 x float>* %a) #0 {
+define void @frintm_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintm_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.floor.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frintm_v64f32(<64 x float>* %a) #0 {
+define void @frintm_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintm_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.floor.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -434,30 +454,33 @@ define void @frintm_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintm_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintm_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v1f64:
-; CHECK: frintm d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintm d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintm_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintm_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v2f64:
-; CHECK: frintm v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintm v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frintm_v4f64(<4 x double>* %a) #0 {
+define void @frintm_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintm_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -465,49 +488,53 @@ define void @frintm_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frintm_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintm_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintm_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintm z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frintm z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintm_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintm z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.floor.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frintm_v16f64(<16 x double>* %a) #0 {
+define void @frintm_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintm_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.floor.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frintm_v32f64(<32 x double>* %a) #0 {
+define void @frintm_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintm_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.floor.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -519,30 +546,33 @@ define void @frintm_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frinti_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frinti_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v4f16:
-; CHECK: frinti v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinti v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frinti_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frinti_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v8f16:
-; CHECK: frinti v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinti v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frinti_v16f16(<16 x half>* %a) #0 {
+define void @frinti_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -550,49 +580,53 @@ define void @frinti_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frinti_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frinti_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinti_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frinti z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frinti z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frinti_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frinti z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.nearbyint.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frinti_v64f16(<64 x half>* %a) #0 {
+define void @frinti_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frinti_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.nearbyint.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frinti_v128f16(<128 x half>* %a) #0 {
+define void @frinti_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frinti_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.nearbyint.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -600,30 +634,33 @@ define void @frinti_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frinti_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frinti_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v2f32:
-; CHECK: frinti v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinti v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frinti_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frinti_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v4f32:
-; CHECK: frinti v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinti v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frinti_v8f32(<8 x float>* %a) #0 {
+define void @frinti_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -631,49 +668,53 @@ define void @frinti_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frinti_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frinti_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinti_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frinti z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frinti z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frinti_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frinti z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frinti_v32f32(<32 x float>* %a) #0 {
+define void @frinti_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frinti_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.nearbyint.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frinti_v64f32(<64 x float>* %a) #0 {
+define void @frinti_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frinti_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.nearbyint.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -681,30 +722,33 @@ define void @frinti_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frinti_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frinti_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v1f64:
-; CHECK: frinti d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinti d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frinti_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frinti_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v2f64:
-; CHECK: frinti v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinti v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frinti_v4f64(<4 x double>* %a) #0 {
+define void @frinti_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinti_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -712,49 +756,53 @@ define void @frinti_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frinti_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frinti_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinti_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frinti z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frinti z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frinti_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frinti z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frinti_v16f64(<16 x double>* %a) #0 {
+define void @frinti_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frinti_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.nearbyint.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frinti_v32f64(<32 x double>* %a) #0 {
+define void @frinti_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frinti_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.nearbyint.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -766,30 +814,33 @@ define void @frinti_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintx_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintx_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v4f16:
-; CHECK: frintx v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintx_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintx_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v8f16:
-; CHECK: frintx v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frintx_v16f16(<16 x half>* %a) #0 {
+define void @frintx_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -797,49 +848,53 @@ define void @frintx_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frintx_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintx_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintx_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintx z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frintx z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintx_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintx z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.rint.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frintx_v64f16(<64 x half>* %a) #0 {
+define void @frintx_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintx_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.rint.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frintx_v128f16(<128 x half>* %a) #0 {
+define void @frintx_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintx_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.rint.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -847,30 +902,33 @@ define void @frintx_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintx_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintx_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v2f32:
-; CHECK: frintx v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintx_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintx_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v4f32:
-; CHECK: frintx v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frintx_v8f32(<8 x float>* %a) #0 {
+define void @frintx_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -878,49 +936,53 @@ define void @frintx_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frintx_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintx_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintx_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintx z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frintx z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintx_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintx z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.rint.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frintx_v32f32(<32 x float>* %a) #0 {
+define void @frintx_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintx_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.rint.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frintx_v64f32(<64 x float>* %a) #0 {
+define void @frintx_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintx_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.rint.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -928,30 +990,33 @@ define void @frintx_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintx_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintx_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v1f64:
-; CHECK: frintx d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintx_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintx_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v2f64:
-; CHECK: frintx v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frintx_v4f64(<4 x double>* %a) #0 {
+define void @frintx_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintx_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -959,49 +1024,53 @@ define void @frintx_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frintx_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintx_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintx_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintx z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frintx z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintx_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintx z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.rint.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frintx_v16f64(<16 x double>* %a) #0 {
+define void @frintx_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintx_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.rint.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frintx_v32f64(<32 x double>* %a) #0 {
+define void @frintx_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintx_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.rint.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -1013,30 +1082,33 @@ define void @frintx_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frinta_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frinta_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v4f16:
-; CHECK: frinta v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinta v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frinta_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frinta_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v8f16:
-; CHECK: frinta v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinta v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frinta_v16f16(<16 x half>* %a) #0 {
+define void @frinta_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -1044,49 +1116,53 @@ define void @frinta_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frinta_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frinta_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinta_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frinta z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frinta z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frinta_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frinta z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.round.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frinta_v64f16(<64 x half>* %a) #0 {
+define void @frinta_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frinta_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.round.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frinta_v128f16(<128 x half>* %a) #0 {
+define void @frinta_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frinta_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.round.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -1094,30 +1170,33 @@ define void @frinta_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frinta_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frinta_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v2f32:
-; CHECK: frinta v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinta v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frinta_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frinta_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v4f32:
-; CHECK: frinta v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinta v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frinta_v8f32(<8 x float>* %a) #0 {
+define void @frinta_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -1125,49 +1204,53 @@ define void @frinta_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frinta_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frinta_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinta_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frinta z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frinta z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frinta_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frinta z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.round.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frinta_v32f32(<32 x float>* %a) #0 {
+define void @frinta_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frinta_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.round.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frinta_v64f32(<64 x float>* %a) #0 {
+define void @frinta_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frinta_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.round.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -1175,30 +1258,33 @@ define void @frinta_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frinta_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frinta_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v1f64:
-; CHECK: frinta d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinta d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frinta_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frinta_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v2f64:
-; CHECK: frinta v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frinta v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frinta_v4f64(<4 x double>* %a) #0 {
+define void @frinta_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frinta_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -1206,49 +1292,53 @@ define void @frinta_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frinta_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frinta_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frinta_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frinta z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frinta z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frinta_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frinta z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.round.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frinta_v16f64(<16 x double>* %a) #0 {
+define void @frinta_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frinta_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.round.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frinta_v32f64(<32 x double>* %a) #0 {
+define void @frinta_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frinta_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.round.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -1260,30 +1350,33 @@ define void @frinta_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintn_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintn_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v4f16:
-; CHECK: frintn v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintn v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintn_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintn_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v8f16:
-; CHECK: frintn v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintn v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frintn_v16f16(<16 x half>* %a) #0 {
+define void @frintn_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -1291,49 +1384,53 @@ define void @frintn_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frintn_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintn_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintn_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintn z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frintn z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintn_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintn z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.roundeven.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frintn_v64f16(<64 x half>* %a) #0 {
+define void @frintn_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintn_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.roundeven.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frintn_v128f16(<128 x half>* %a) #0 {
+define void @frintn_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintn_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintn [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.roundeven.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -1341,30 +1438,33 @@ define void @frintn_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintn_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintn_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v2f32:
-; CHECK: frintn v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintn v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintn_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintn_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v4f32:
-; CHECK: frintn v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintn v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frintn_v8f32(<8 x float>* %a) #0 {
+define void @frintn_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -1372,49 +1472,53 @@ define void @frintn_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frintn_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintn_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintn_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintn z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frintn z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintn_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintn z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frintn_v32f32(<32 x float>* %a) #0 {
+define void @frintn_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintn_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.roundeven.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frintn_v64f32(<64 x float>* %a) #0 {
+define void @frintn_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintn_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintn [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.roundeven.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -1422,30 +1526,33 @@ define void @frintn_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintn_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintn_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v1f64:
-; CHECK: frintn d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintn d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintn_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintn_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v2f64:
-; CHECK: frintn v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintn v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frintn_v4f64(<4 x double>* %a) #0 {
+define void @frintn_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintn_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -1453,49 +1560,53 @@ define void @frintn_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frintn_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintn_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintn_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintn z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frintn z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintn_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintn z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frintn_v16f64(<16 x double>* %a) #0 {
+define void @frintn_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintn_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.roundeven.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frintn_v32f64(<32 x double>* %a) #0 {
+define void @frintn_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintn_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintn [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.roundeven.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a
@@ -1507,30 +1618,33 @@ define void @frintn_v32f64(<32 x double>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @frintz_v4f16(<4 x half> %op) #0 {
+define <4 x half> @frintz_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v4f16:
-; CHECK: frintz v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @frintz_v8f16(<8 x half> %op) #0 {
+define <8 x half> @frintz_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v8f16:
-; CHECK: frintz v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
 
-define void @frintz_v16f16(<16 x half>* %a) #0 {
+define void @frintz_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v16f16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x half>, <16 x half>* %a
   %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
   store <16 x half> %res, <16 x half>* %a
@@ -1538,49 +1652,53 @@ define void @frintz_v16f16(<16 x half>* %a) #0 {
 }
 
 define void @frintz_v32f16(<32 x half>* %a) #0 {
-; CHECK-LABEL: frintz_v32f16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintz_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintz z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    frintz z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintz_v32f16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintz z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x half>, <32 x half>* %a
   %res = call <32 x half> @llvm.trunc.v32f16(<32 x half> %op)
   store <32 x half> %res, <32 x half>* %a
   ret void
 }
 
-define void @frintz_v64f16(<64 x half>* %a) #0 {
+define void @frintz_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintz_v64f16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x half>, <64 x half>* %a
   %res = call <64 x half> @llvm.trunc.v64f16(<64 x half> %op)
   store <64 x half> %res, <64 x half>* %a
   ret void
 }
 
-define void @frintz_v128f16(<128 x half>* %a) #0 {
+define void @frintz_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintz_v128f16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x half>, <128 x half>* %a
   %res = call <128 x half> @llvm.trunc.v128f16(<128 x half> %op)
   store <128 x half> %res, <128 x half>* %a
@@ -1588,30 +1706,33 @@ define void @frintz_v128f16(<128 x half>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @frintz_v2f32(<2 x float> %op) #0 {
+define <2 x float> @frintz_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v2f32:
-; CHECK: frintz v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @frintz_v4f32(<4 x float> %op) #0 {
+define <4 x float> @frintz_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v4f32:
-; CHECK: frintz v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
 
-define void @frintz_v8f32(<8 x float>* %a) #0 {
+define void @frintz_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x float>, <8 x float>* %a
   %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
   store <8 x float> %res, <8 x float>* %a
@@ -1619,49 +1740,53 @@ define void @frintz_v8f32(<8 x float>* %a) #0 {
 }
 
 define void @frintz_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: frintz_v16f32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintz_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintz z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    frintz z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintz_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintz z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x float>, <16 x float>* %a
   %res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %op)
   store <16 x float> %res, <16 x float>* %a
   ret void
 }
 
-define void @frintz_v32f32(<32 x float>* %a) #0 {
+define void @frintz_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintz_v32f32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x float>, <32 x float>* %a
   %res = call <32 x float> @llvm.trunc.v32f32(<32 x float> %op)
   store <32 x float> %res, <32 x float>* %a
   ret void
 }
 
-define void @frintz_v64f32(<64 x float>* %a) #0 {
+define void @frintz_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintz_v64f32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x float>, <64 x float>* %a
   %res = call <64 x float> @llvm.trunc.v64f32(<64 x float> %op)
   store <64 x float> %res, <64 x float>* %a
@@ -1669,30 +1794,33 @@ define void @frintz_v64f32(<64 x float>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @frintz_v1f64(<1 x double> %op) #0 {
+define <1 x double> @frintz_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v1f64:
-; CHECK: frintz d0, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @frintz_v2f64(<2 x double> %op) #0 {
+define <2 x double> @frintz_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v2f64:
-; CHECK: frintz v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
 
-define void @frintz_v4f64(<4 x double>* %a) #0 {
+define void @frintz_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: frintz_v4f64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x double>, <4 x double>* %a
   %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
   store <4 x double> %res, <4 x double>* %a
@@ -1700,49 +1828,53 @@ define void @frintz_v4f64(<4 x double>* %a) #0 {
 }
 
 define void @frintz_v8f64(<8 x double>* %a) #0 {
-; CHECK-LABEL: frintz_v8f64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: frintz_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    frintz z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    frintz z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: frintz_v8f64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    frintz z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x double>, <8 x double>* %a
   %res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %op)
   store <8 x double> %res, <8 x double>* %a
   ret void
 }
 
-define void @frintz_v16f64(<16 x double>* %a) #0 {
+define void @frintz_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: frintz_v16f64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x double>, <16 x double>* %a
   %res = call <16 x double> @llvm.trunc.v16f64(<16 x double> %op)
   store <16 x double> %res, <16 x double>* %a
   ret void
 }
 
-define void @frintz_v32f64(<32 x double>* %a) #0 {
+define void @frintz_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: frintz_v32f64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x double>, <32 x double>* %a
   %res = call <32 x double> @llvm.trunc.v32f64(<32 x double> %op)
   store <32 x double> %res, <32 x double>* %a

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
index 3539fcbf28b7..6d2d4227bfd3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -1,36 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v4f16:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w0, #0x1
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    dup v2.4h, w8
-; NO_SVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NO_SVE-NEXT:    ret
-;
+define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -43,15 +19,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v8f16:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w0, #0x1
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    dup v2.8h, w8
-; NO_SVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NO_SVE-NEXT:    ret
-;
+define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -63,21 +31,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
   ret <8 x half> %sel
 }
 
-define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v16f16:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0, #16]
-; NO_SVE-NEXT:    ldr q2, [x1]
-; NO_SVE-NEXT:    ldr q3, [x1, #16]
-; NO_SVE-NEXT:    dup v4.8h, w8
-; NO_SVE-NEXT:    bif v0.16b, v2.16b, v4.16b
-; NO_SVE-NEXT:    bif v1.16b, v3.16b, v4.16b
-; NO_SVE-NEXT:    stp q0, q1, [x0]
-; NO_SVE-NEXT:    ret
-;
+define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w2, #0x1
@@ -99,26 +53,24 @@ define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
 }
 
 define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v32f16:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #48]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0]
-; NO_SVE-NEXT:    ldr q2, [x0, #16]
-; NO_SVE-NEXT:    ldr q3, [x0, #32]
-; NO_SVE-NEXT:    ldr q4, [x1, #48]
-; NO_SVE-NEXT:    dup v6.8h, w8
-; NO_SVE-NEXT:    ldr q5, [x1]
-; NO_SVE-NEXT:    ldr q7, [x1, #16]
-; NO_SVE-NEXT:    ldr q16, [x1, #32]
-; NO_SVE-NEXT:    bif v1.16b, v5.16b, v6.16b
-; NO_SVE-NEXT:    bif v2.16b, v7.16b, v6.16b
-; NO_SVE-NEXT:    bif v0.16b, v4.16b, v6.16b
-; NO_SVE-NEXT:    bif v3.16b, v16.16b, v6.16b
-; NO_SVE-NEXT:    stp q1, q2, [x0]
-; NO_SVE-NEXT:    stp q3, q0, [x0, #32]
-; NO_SVE-NEXT:    ret
+; VBITS_GE_256-LABEL: select_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.h
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.h, w9
+; VBITS_GE_256-NEXT:    and z4.h, z4.h, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT:    sel z1.h, p1, z1.h, z3.h
+; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z2.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v32f16:
 ; VBITS_GE_512:       // %bb.0:
@@ -140,58 +92,20 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v64f16:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #16]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0]
-; NO_SVE-NEXT:    ldr q2, [x0, #48]
-; NO_SVE-NEXT:    ldr q3, [x0, #32]
-; NO_SVE-NEXT:    ldr q4, [x0, #80]
-; NO_SVE-NEXT:    dup v21.8h, w8
-; NO_SVE-NEXT:    ldr q5, [x0, #64]
-; NO_SVE-NEXT:    ldr q6, [x0, #112]
-; NO_SVE-NEXT:    ldr q7, [x0, #96]
-; NO_SVE-NEXT:    ldr q16, [x1, #16]
-; NO_SVE-NEXT:    ldr q17, [x1]
-; NO_SVE-NEXT:    ldr q18, [x1, #48]
-; NO_SVE-NEXT:    ldr q19, [x1, #32]
-; NO_SVE-NEXT:    bif v0.16b, v16.16b, v21.16b
-; NO_SVE-NEXT:    ldr q20, [x1, #80]
-; NO_SVE-NEXT:    bif v1.16b, v17.16b, v21.16b
-; NO_SVE-NEXT:    ldr q16, [x1, #64]
-; NO_SVE-NEXT:    bif v2.16b, v18.16b, v21.16b
-; NO_SVE-NEXT:    ldr q17, [x1, #112]
-; NO_SVE-NEXT:    bif v3.16b, v19.16b, v21.16b
-; NO_SVE-NEXT:    ldr q18, [x1, #96]
-; NO_SVE-NEXT:    bif v4.16b, v20.16b, v21.16b
-; NO_SVE-NEXT:    stp q1, q0, [x0]
-; NO_SVE-NEXT:    mov v0.16b, v21.16b
-; NO_SVE-NEXT:    mov v1.16b, v21.16b
-; NO_SVE-NEXT:    stp q3, q2, [x0, #32]
-; NO_SVE-NEXT:    mov v2.16b, v21.16b
-; NO_SVE-NEXT:    bsl v0.16b, v5.16b, v16.16b
-; NO_SVE-NEXT:    bsl v1.16b, v6.16b, v17.16b
-; NO_SVE-NEXT:    bsl v2.16b, v7.16b, v18.16b
-; NO_SVE-NEXT:    stp q0, q4, [x0, #64]
-; NO_SVE-NEXT:    stp q2, q1, [x0, #96]
-; NO_SVE-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: select_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    and w8, w2, #0x1
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ptrue p1.h
-; VBITS_GE_1024-NEXT:    mov z2.h, w8
-; VBITS_GE_1024-NEXT:    and z2.h, z2.h, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; VBITS_GE_1024-NEXT:    sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z2.h, z2.h, #0x1
+; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <64 x half>, <64 x half>* %a
   %op2 = load volatile <64 x half>, <64 x half>* %b
   %sel = select i1 %mask, <64 x half> %op1, <64 x half> %op2
@@ -199,103 +113,20 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v128f16:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; NO_SVE-NEXT:    .cfi_def_cfa_offset 32
-; NO_SVE-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; NO_SVE-NEXT:    .cfi_offset b8, -8
-; NO_SVE-NEXT:    .cfi_offset b9, -16
-; NO_SVE-NEXT:    .cfi_offset b10, -24
-; NO_SVE-NEXT:    .cfi_offset b11, -32
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #240]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0, #224]
-; NO_SVE-NEXT:    ldr q2, [x0, #208]
-; NO_SVE-NEXT:    ldr q3, [x0, #192]
-; NO_SVE-NEXT:    ldr q4, [x0, #176]
-; NO_SVE-NEXT:    dup v8.8h, w8
-; NO_SVE-NEXT:    ldr q5, [x0, #160]
-; NO_SVE-NEXT:    ldr q6, [x0, #144]
-; NO_SVE-NEXT:    ldr q7, [x0, #128]
-; NO_SVE-NEXT:    ldr q16, [x0, #112]
-; NO_SVE-NEXT:    ldr q17, [x0, #96]
-; NO_SVE-NEXT:    ldr q18, [x0, #80]
-; NO_SVE-NEXT:    ldr q19, [x0, #64]
-; NO_SVE-NEXT:    ldr q20, [x0, #48]
-; NO_SVE-NEXT:    ldr q21, [x0, #32]
-; NO_SVE-NEXT:    ldr q22, [x0, #16]
-; NO_SVE-NEXT:    ldr q23, [x0]
-; NO_SVE-NEXT:    ldr q24, [x1, #240]
-; NO_SVE-NEXT:    ldr q25, [x1, #224]
-; NO_SVE-NEXT:    ldr q26, [x1, #208]
-; NO_SVE-NEXT:    ldr q27, [x1, #192]
-; NO_SVE-NEXT:    bif v0.16b, v24.16b, v8.16b
-; NO_SVE-NEXT:    ldr q28, [x1, #176]
-; NO_SVE-NEXT:    bif v1.16b, v25.16b, v8.16b
-; NO_SVE-NEXT:    ldr q29, [x1, #160]
-; NO_SVE-NEXT:    bif v2.16b, v26.16b, v8.16b
-; NO_SVE-NEXT:    ldr q30, [x1, #144]
-; NO_SVE-NEXT:    bif v3.16b, v27.16b, v8.16b
-; NO_SVE-NEXT:    ldr q31, [x1, #128]
-; NO_SVE-NEXT:    ldr q9, [x1, #112]
-; NO_SVE-NEXT:    ldr q10, [x1, #96]
-; NO_SVE-NEXT:    bif v4.16b, v28.16b, v8.16b
-; NO_SVE-NEXT:    ldr q28, [x1, #80]
-; NO_SVE-NEXT:    ldr q24, [x1, #64]
-; NO_SVE-NEXT:    ldr q25, [x1, #48]
-; NO_SVE-NEXT:    ldr q26, [x1, #32]
-; NO_SVE-NEXT:    ldr q27, [x1, #16]
-; NO_SVE-NEXT:    ldr q11, [x1]
-; NO_SVE-NEXT:    stp q3, q2, [x0, #192]
-; NO_SVE-NEXT:    stp q1, q0, [x0, #224]
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    mov v1.16b, v8.16b
-; NO_SVE-NEXT:    mov v2.16b, v8.16b
-; NO_SVE-NEXT:    bsl v0.16b, v5.16b, v29.16b
-; NO_SVE-NEXT:    bsl v1.16b, v6.16b, v30.16b
-; NO_SVE-NEXT:    bsl v2.16b, v7.16b, v31.16b
-; NO_SVE-NEXT:    mov v3.16b, v8.16b
-; NO_SVE-NEXT:    stp q0, q4, [x0, #160]
-; NO_SVE-NEXT:    mov v4.16b, v8.16b
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    stp q2, q1, [x0, #128]
-; NO_SVE-NEXT:    mov v1.16b, v8.16b
-; NO_SVE-NEXT:    bsl v3.16b, v16.16b, v9.16b
-; NO_SVE-NEXT:    bsl v4.16b, v17.16b, v10.16b
-; NO_SVE-NEXT:    bsl v0.16b, v18.16b, v28.16b
-; NO_SVE-NEXT:    bsl v1.16b, v19.16b, v24.16b
-; NO_SVE-NEXT:    mov v2.16b, v8.16b
-; NO_SVE-NEXT:    stp q4, q3, [x0, #96]
-; NO_SVE-NEXT:    mov v3.16b, v8.16b
-; NO_SVE-NEXT:    mov v4.16b, v8.16b
-; NO_SVE-NEXT:    stp q1, q0, [x0, #64]
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    bsl v2.16b, v20.16b, v25.16b
-; NO_SVE-NEXT:    bsl v3.16b, v21.16b, v26.16b
-; NO_SVE-NEXT:    bsl v4.16b, v22.16b, v27.16b
-; NO_SVE-NEXT:    bsl v0.16b, v23.16b, v11.16b
-; NO_SVE-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; NO_SVE-NEXT:    stp q3, q2, [x0, #32]
-; NO_SVE-NEXT:    stp q0, q4, [x0]
-; NO_SVE-NEXT:    ldp d11, d10, [sp], #32 // 16-byte Folded Reload
-; NO_SVE-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: select_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    and w8, w2, #0x1
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ptrue p1.h
-; VBITS_GE_2048-NEXT:    mov z2.h, w8
-; VBITS_GE_2048-NEXT:    and z2.h, z2.h, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; VBITS_GE_2048-NEXT:    sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z2.h, z2.h, #0x1
+; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <128 x half>, <128 x half>* %a
   %op2 = load volatile <128 x half>, <128 x half>* %b
   %sel = select i1 %mask, <128 x half> %op1, <128 x half> %op2
@@ -304,15 +135,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v2f32:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w0, #0x1
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    dup v2.2s, w8
-; NO_SVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NO_SVE-NEXT:    ret
-;
+define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -325,15 +148,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v4f32:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w0, #0x1
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    dup v2.4s, w8
-; NO_SVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NO_SVE-NEXT:    ret
-;
+define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -345,21 +160,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #
   ret <4 x float> %sel
 }
 
-define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v8f32:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0, #16]
-; NO_SVE-NEXT:    ldr q2, [x1]
-; NO_SVE-NEXT:    ldr q3, [x1, #16]
-; NO_SVE-NEXT:    dup v4.4s, w8
-; NO_SVE-NEXT:    bif v0.16b, v2.16b, v4.16b
-; NO_SVE-NEXT:    bif v1.16b, v3.16b, v4.16b
-; NO_SVE-NEXT:    stp q0, q1, [x0]
-; NO_SVE-NEXT:    ret
-;
+define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w2, #0x1
@@ -381,26 +182,24 @@ define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
 }
 
 define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v16f32:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #48]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0]
-; NO_SVE-NEXT:    ldr q2, [x0, #16]
-; NO_SVE-NEXT:    ldr q3, [x0, #32]
-; NO_SVE-NEXT:    ldr q4, [x1, #48]
-; NO_SVE-NEXT:    dup v6.4s, w8
-; NO_SVE-NEXT:    ldr q5, [x1]
-; NO_SVE-NEXT:    ldr q7, [x1, #16]
-; NO_SVE-NEXT:    ldr q16, [x1, #32]
-; NO_SVE-NEXT:    bif v1.16b, v5.16b, v6.16b
-; NO_SVE-NEXT:    bif v2.16b, v7.16b, v6.16b
-; NO_SVE-NEXT:    bif v0.16b, v4.16b, v6.16b
-; NO_SVE-NEXT:    bif v3.16b, v16.16b, v6.16b
-; NO_SVE-NEXT:    stp q1, q2, [x0]
-; NO_SVE-NEXT:    stp q3, q0, [x0, #32]
-; NO_SVE-NEXT:    ret
+; VBITS_GE_256-LABEL: select_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ptrue p1.s
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.s, w9
+; VBITS_GE_256-NEXT:    and z4.s, z4.s, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z4.s, #0
+; VBITS_GE_256-NEXT:    sel z1.s, p1, z1.s, z3.s
+; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z2.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v16f32:
 ; VBITS_GE_512:       // %bb.0:
@@ -422,58 +221,20 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v32f32:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #16]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0]
-; NO_SVE-NEXT:    ldr q2, [x0, #48]
-; NO_SVE-NEXT:    ldr q3, [x0, #32]
-; NO_SVE-NEXT:    ldr q4, [x0, #80]
-; NO_SVE-NEXT:    dup v21.4s, w8
-; NO_SVE-NEXT:    ldr q5, [x0, #64]
-; NO_SVE-NEXT:    ldr q6, [x0, #112]
-; NO_SVE-NEXT:    ldr q7, [x0, #96]
-; NO_SVE-NEXT:    ldr q16, [x1, #16]
-; NO_SVE-NEXT:    ldr q17, [x1]
-; NO_SVE-NEXT:    ldr q18, [x1, #48]
-; NO_SVE-NEXT:    ldr q19, [x1, #32]
-; NO_SVE-NEXT:    bif v0.16b, v16.16b, v21.16b
-; NO_SVE-NEXT:    ldr q20, [x1, #80]
-; NO_SVE-NEXT:    bif v1.16b, v17.16b, v21.16b
-; NO_SVE-NEXT:    ldr q16, [x1, #64]
-; NO_SVE-NEXT:    bif v2.16b, v18.16b, v21.16b
-; NO_SVE-NEXT:    ldr q17, [x1, #112]
-; NO_SVE-NEXT:    bif v3.16b, v19.16b, v21.16b
-; NO_SVE-NEXT:    ldr q18, [x1, #96]
-; NO_SVE-NEXT:    bif v4.16b, v20.16b, v21.16b
-; NO_SVE-NEXT:    stp q1, q0, [x0]
-; NO_SVE-NEXT:    mov v0.16b, v21.16b
-; NO_SVE-NEXT:    mov v1.16b, v21.16b
-; NO_SVE-NEXT:    stp q3, q2, [x0, #32]
-; NO_SVE-NEXT:    mov v2.16b, v21.16b
-; NO_SVE-NEXT:    bsl v0.16b, v5.16b, v16.16b
-; NO_SVE-NEXT:    bsl v1.16b, v6.16b, v17.16b
-; NO_SVE-NEXT:    bsl v2.16b, v7.16b, v18.16b
-; NO_SVE-NEXT:    stp q0, q4, [x0, #64]
-; NO_SVE-NEXT:    stp q2, q1, [x0, #96]
-; NO_SVE-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: select_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    and w8, w2, #0x1
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ptrue p1.s
-; VBITS_GE_1024-NEXT:    mov z2.s, w8
-; VBITS_GE_1024-NEXT:    and z2.s, z2.s, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; VBITS_GE_1024-NEXT:    sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    and z2.s, z2.s, #0x1
+; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <32 x float>, <32 x float>* %a
   %op2 = load volatile <32 x float>, <32 x float>* %b
   %sel = select i1 %mask, <32 x float> %op1, <32 x float> %op2
@@ -481,103 +242,20 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v64f32:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; NO_SVE-NEXT:    .cfi_def_cfa_offset 32
-; NO_SVE-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; NO_SVE-NEXT:    .cfi_offset b8, -8
-; NO_SVE-NEXT:    .cfi_offset b9, -16
-; NO_SVE-NEXT:    .cfi_offset b10, -24
-; NO_SVE-NEXT:    .cfi_offset b11, -32
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #240]
-; NO_SVE-NEXT:    csetm w8, ne
-; NO_SVE-NEXT:    ldr q1, [x0, #224]
-; NO_SVE-NEXT:    ldr q2, [x0, #208]
-; NO_SVE-NEXT:    ldr q3, [x0, #192]
-; NO_SVE-NEXT:    ldr q4, [x0, #176]
-; NO_SVE-NEXT:    dup v8.4s, w8
-; NO_SVE-NEXT:    ldr q5, [x0, #160]
-; NO_SVE-NEXT:    ldr q6, [x0, #144]
-; NO_SVE-NEXT:    ldr q7, [x0, #128]
-; NO_SVE-NEXT:    ldr q16, [x0, #112]
-; NO_SVE-NEXT:    ldr q17, [x0, #96]
-; NO_SVE-NEXT:    ldr q18, [x0, #80]
-; NO_SVE-NEXT:    ldr q19, [x0, #64]
-; NO_SVE-NEXT:    ldr q20, [x0, #48]
-; NO_SVE-NEXT:    ldr q21, [x0, #32]
-; NO_SVE-NEXT:    ldr q22, [x0, #16]
-; NO_SVE-NEXT:    ldr q23, [x0]
-; NO_SVE-NEXT:    ldr q24, [x1, #240]
-; NO_SVE-NEXT:    ldr q25, [x1, #224]
-; NO_SVE-NEXT:    ldr q26, [x1, #208]
-; NO_SVE-NEXT:    ldr q27, [x1, #192]
-; NO_SVE-NEXT:    bif v0.16b, v24.16b, v8.16b
-; NO_SVE-NEXT:    ldr q28, [x1, #176]
-; NO_SVE-NEXT:    bif v1.16b, v25.16b, v8.16b
-; NO_SVE-NEXT:    ldr q29, [x1, #160]
-; NO_SVE-NEXT:    bif v2.16b, v26.16b, v8.16b
-; NO_SVE-NEXT:    ldr q30, [x1, #144]
-; NO_SVE-NEXT:    bif v3.16b, v27.16b, v8.16b
-; NO_SVE-NEXT:    ldr q31, [x1, #128]
-; NO_SVE-NEXT:    ldr q9, [x1, #112]
-; NO_SVE-NEXT:    ldr q10, [x1, #96]
-; NO_SVE-NEXT:    bif v4.16b, v28.16b, v8.16b
-; NO_SVE-NEXT:    ldr q28, [x1, #80]
-; NO_SVE-NEXT:    ldr q24, [x1, #64]
-; NO_SVE-NEXT:    ldr q25, [x1, #48]
-; NO_SVE-NEXT:    ldr q26, [x1, #32]
-; NO_SVE-NEXT:    ldr q27, [x1, #16]
-; NO_SVE-NEXT:    ldr q11, [x1]
-; NO_SVE-NEXT:    stp q3, q2, [x0, #192]
-; NO_SVE-NEXT:    stp q1, q0, [x0, #224]
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    mov v1.16b, v8.16b
-; NO_SVE-NEXT:    mov v2.16b, v8.16b
-; NO_SVE-NEXT:    bsl v0.16b, v5.16b, v29.16b
-; NO_SVE-NEXT:    bsl v1.16b, v6.16b, v30.16b
-; NO_SVE-NEXT:    bsl v2.16b, v7.16b, v31.16b
-; NO_SVE-NEXT:    mov v3.16b, v8.16b
-; NO_SVE-NEXT:    stp q0, q4, [x0, #160]
-; NO_SVE-NEXT:    mov v4.16b, v8.16b
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    stp q2, q1, [x0, #128]
-; NO_SVE-NEXT:    mov v1.16b, v8.16b
-; NO_SVE-NEXT:    bsl v3.16b, v16.16b, v9.16b
-; NO_SVE-NEXT:    bsl v4.16b, v17.16b, v10.16b
-; NO_SVE-NEXT:    bsl v0.16b, v18.16b, v28.16b
-; NO_SVE-NEXT:    bsl v1.16b, v19.16b, v24.16b
-; NO_SVE-NEXT:    mov v2.16b, v8.16b
-; NO_SVE-NEXT:    stp q4, q3, [x0, #96]
-; NO_SVE-NEXT:    mov v3.16b, v8.16b
-; NO_SVE-NEXT:    mov v4.16b, v8.16b
-; NO_SVE-NEXT:    stp q1, q0, [x0, #64]
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    bsl v2.16b, v20.16b, v25.16b
-; NO_SVE-NEXT:    bsl v3.16b, v21.16b, v26.16b
-; NO_SVE-NEXT:    bsl v4.16b, v22.16b, v27.16b
-; NO_SVE-NEXT:    bsl v0.16b, v23.16b, v11.16b
-; NO_SVE-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; NO_SVE-NEXT:    stp q3, q2, [x0, #32]
-; NO_SVE-NEXT:    stp q0, q4, [x0]
-; NO_SVE-NEXT:    ldp d11, d10, [sp], #32 // 16-byte Folded Reload
-; NO_SVE-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: select_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    and w8, w2, #0x1
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ptrue p1.s
-; VBITS_GE_2048-NEXT:    mov z2.s, w8
-; VBITS_GE_2048-NEXT:    and z2.s, z2.s, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; VBITS_GE_2048-NEXT:    sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    and z2.s, z2.s, #0x1
+; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <64 x float>, <64 x float>* %a
   %op2 = load volatile <64 x float>, <64 x float>* %b
   %sel = select i1 %mask, <64 x float> %op1, <64 x float> %op2
@@ -586,15 +264,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v1f64:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w0, #0x1
-; NO_SVE-NEXT:    csetm x8, ne
-; NO_SVE-NEXT:    fmov d2, x8
-; NO_SVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NO_SVE-NEXT:    ret
-;
+define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -607,15 +277,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v2f64:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w0, #0x1
-; NO_SVE-NEXT:    csetm x8, ne
-; NO_SVE-NEXT:    dup v2.2d, x8
-; NO_SVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NO_SVE-NEXT:    ret
-;
+define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -627,21 +289,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
   ret <2 x double> %sel
 }
 
-define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v4f64:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0]
-; NO_SVE-NEXT:    csetm x8, ne
-; NO_SVE-NEXT:    ldr q1, [x0, #16]
-; NO_SVE-NEXT:    ldr q2, [x1]
-; NO_SVE-NEXT:    ldr q3, [x1, #16]
-; NO_SVE-NEXT:    dup v4.2d, x8
-; NO_SVE-NEXT:    bif v0.16b, v2.16b, v4.16b
-; NO_SVE-NEXT:    bif v1.16b, v3.16b, v4.16b
-; NO_SVE-NEXT:    stp q0, q1, [x0]
-; NO_SVE-NEXT:    ret
-;
+define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w2, #0x1
@@ -663,26 +311,24 @@ define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
 }
 
 define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v8f64:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #48]
-; NO_SVE-NEXT:    csetm x8, ne
-; NO_SVE-NEXT:    ldr q1, [x0]
-; NO_SVE-NEXT:    ldr q2, [x0, #16]
-; NO_SVE-NEXT:    ldr q3, [x0, #32]
-; NO_SVE-NEXT:    ldr q4, [x1, #48]
-; NO_SVE-NEXT:    dup v6.2d, x8
-; NO_SVE-NEXT:    ldr q5, [x1]
-; NO_SVE-NEXT:    ldr q7, [x1, #16]
-; NO_SVE-NEXT:    ldr q16, [x1, #32]
-; NO_SVE-NEXT:    bif v1.16b, v5.16b, v6.16b
-; NO_SVE-NEXT:    bif v2.16b, v7.16b, v6.16b
-; NO_SVE-NEXT:    bif v0.16b, v4.16b, v6.16b
-; NO_SVE-NEXT:    bif v3.16b, v16.16b, v6.16b
-; NO_SVE-NEXT:    stp q1, q2, [x0]
-; NO_SVE-NEXT:    stp q3, q0, [x0, #32]
-; NO_SVE-NEXT:    ret
+; VBITS_GE_256-LABEL: select_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ptrue p1.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.d, x9
+; VBITS_GE_256-NEXT:    and z4.d, z4.d, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z4.d, #0
+; VBITS_GE_256-NEXT:    sel z1.d, p1, z1.d, z3.d
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v8f64:
 ; VBITS_GE_512:       // %bb.0:
@@ -704,58 +350,20 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v16f64:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #16]
-; NO_SVE-NEXT:    csetm x8, ne
-; NO_SVE-NEXT:    ldr q1, [x0]
-; NO_SVE-NEXT:    ldr q2, [x0, #48]
-; NO_SVE-NEXT:    ldr q3, [x0, #32]
-; NO_SVE-NEXT:    ldr q4, [x0, #80]
-; NO_SVE-NEXT:    dup v21.2d, x8
-; NO_SVE-NEXT:    ldr q5, [x0, #64]
-; NO_SVE-NEXT:    ldr q6, [x0, #112]
-; NO_SVE-NEXT:    ldr q7, [x0, #96]
-; NO_SVE-NEXT:    ldr q16, [x1, #16]
-; NO_SVE-NEXT:    ldr q17, [x1]
-; NO_SVE-NEXT:    ldr q18, [x1, #48]
-; NO_SVE-NEXT:    ldr q19, [x1, #32]
-; NO_SVE-NEXT:    bif v0.16b, v16.16b, v21.16b
-; NO_SVE-NEXT:    ldr q20, [x1, #80]
-; NO_SVE-NEXT:    bif v1.16b, v17.16b, v21.16b
-; NO_SVE-NEXT:    ldr q16, [x1, #64]
-; NO_SVE-NEXT:    bif v2.16b, v18.16b, v21.16b
-; NO_SVE-NEXT:    ldr q17, [x1, #112]
-; NO_SVE-NEXT:    bif v3.16b, v19.16b, v21.16b
-; NO_SVE-NEXT:    ldr q18, [x1, #96]
-; NO_SVE-NEXT:    bif v4.16b, v20.16b, v21.16b
-; NO_SVE-NEXT:    stp q1, q0, [x0]
-; NO_SVE-NEXT:    mov v0.16b, v21.16b
-; NO_SVE-NEXT:    mov v1.16b, v21.16b
-; NO_SVE-NEXT:    stp q3, q2, [x0, #32]
-; NO_SVE-NEXT:    mov v2.16b, v21.16b
-; NO_SVE-NEXT:    bsl v0.16b, v5.16b, v16.16b
-; NO_SVE-NEXT:    bsl v1.16b, v6.16b, v17.16b
-; NO_SVE-NEXT:    bsl v2.16b, v7.16b, v18.16b
-; NO_SVE-NEXT:    stp q0, q4, [x0, #64]
-; NO_SVE-NEXT:    stp q2, q1, [x0, #96]
-; NO_SVE-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: select_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    and w8, w2, #0x1
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ptrue p1.d
-; VBITS_GE_1024-NEXT:    mov z2.d, x8
-; VBITS_GE_1024-NEXT:    and z2.d, z2.d, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_1024-NEXT:    sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z2.d, z2.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <16 x double>, <16 x double>* %a
   %op2 = load volatile <16 x double>, <16 x double>* %b
   %sel = select i1 %mask, <16 x double> %op1, <16 x double> %op2
@@ -763,103 +371,20 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) #0 {
-; NO_SVE-LABEL: select_v32f64:
-; NO_SVE:       // %bb.0:
-; NO_SVE-NEXT:    stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; NO_SVE-NEXT:    .cfi_def_cfa_offset 32
-; NO_SVE-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; NO_SVE-NEXT:    .cfi_offset b8, -8
-; NO_SVE-NEXT:    .cfi_offset b9, -16
-; NO_SVE-NEXT:    .cfi_offset b10, -24
-; NO_SVE-NEXT:    .cfi_offset b11, -32
-; NO_SVE-NEXT:    tst w2, #0x1
-; NO_SVE-NEXT:    ldr q0, [x0, #240]
-; NO_SVE-NEXT:    csetm x8, ne
-; NO_SVE-NEXT:    ldr q1, [x0, #224]
-; NO_SVE-NEXT:    ldr q2, [x0, #208]
-; NO_SVE-NEXT:    ldr q3, [x0, #192]
-; NO_SVE-NEXT:    ldr q4, [x0, #176]
-; NO_SVE-NEXT:    dup v8.2d, x8
-; NO_SVE-NEXT:    ldr q5, [x0, #160]
-; NO_SVE-NEXT:    ldr q6, [x0, #144]
-; NO_SVE-NEXT:    ldr q7, [x0, #128]
-; NO_SVE-NEXT:    ldr q16, [x0, #112]
-; NO_SVE-NEXT:    ldr q17, [x0, #96]
-; NO_SVE-NEXT:    ldr q18, [x0, #80]
-; NO_SVE-NEXT:    ldr q19, [x0, #64]
-; NO_SVE-NEXT:    ldr q20, [x0, #48]
-; NO_SVE-NEXT:    ldr q21, [x0, #32]
-; NO_SVE-NEXT:    ldr q22, [x0, #16]
-; NO_SVE-NEXT:    ldr q23, [x0]
-; NO_SVE-NEXT:    ldr q24, [x1, #240]
-; NO_SVE-NEXT:    ldr q25, [x1, #224]
-; NO_SVE-NEXT:    ldr q26, [x1, #208]
-; NO_SVE-NEXT:    ldr q27, [x1, #192]
-; NO_SVE-NEXT:    bif v0.16b, v24.16b, v8.16b
-; NO_SVE-NEXT:    ldr q28, [x1, #176]
-; NO_SVE-NEXT:    bif v1.16b, v25.16b, v8.16b
-; NO_SVE-NEXT:    ldr q29, [x1, #160]
-; NO_SVE-NEXT:    bif v2.16b, v26.16b, v8.16b
-; NO_SVE-NEXT:    ldr q30, [x1, #144]
-; NO_SVE-NEXT:    bif v3.16b, v27.16b, v8.16b
-; NO_SVE-NEXT:    ldr q31, [x1, #128]
-; NO_SVE-NEXT:    ldr q9, [x1, #112]
-; NO_SVE-NEXT:    ldr q10, [x1, #96]
-; NO_SVE-NEXT:    bif v4.16b, v28.16b, v8.16b
-; NO_SVE-NEXT:    ldr q28, [x1, #80]
-; NO_SVE-NEXT:    ldr q24, [x1, #64]
-; NO_SVE-NEXT:    ldr q25, [x1, #48]
-; NO_SVE-NEXT:    ldr q26, [x1, #32]
-; NO_SVE-NEXT:    ldr q27, [x1, #16]
-; NO_SVE-NEXT:    ldr q11, [x1]
-; NO_SVE-NEXT:    stp q3, q2, [x0, #192]
-; NO_SVE-NEXT:    stp q1, q0, [x0, #224]
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    mov v1.16b, v8.16b
-; NO_SVE-NEXT:    mov v2.16b, v8.16b
-; NO_SVE-NEXT:    bsl v0.16b, v5.16b, v29.16b
-; NO_SVE-NEXT:    bsl v1.16b, v6.16b, v30.16b
-; NO_SVE-NEXT:    bsl v2.16b, v7.16b, v31.16b
-; NO_SVE-NEXT:    mov v3.16b, v8.16b
-; NO_SVE-NEXT:    stp q0, q4, [x0, #160]
-; NO_SVE-NEXT:    mov v4.16b, v8.16b
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    stp q2, q1, [x0, #128]
-; NO_SVE-NEXT:    mov v1.16b, v8.16b
-; NO_SVE-NEXT:    bsl v3.16b, v16.16b, v9.16b
-; NO_SVE-NEXT:    bsl v4.16b, v17.16b, v10.16b
-; NO_SVE-NEXT:    bsl v0.16b, v18.16b, v28.16b
-; NO_SVE-NEXT:    bsl v1.16b, v19.16b, v24.16b
-; NO_SVE-NEXT:    mov v2.16b, v8.16b
-; NO_SVE-NEXT:    stp q4, q3, [x0, #96]
-; NO_SVE-NEXT:    mov v3.16b, v8.16b
-; NO_SVE-NEXT:    mov v4.16b, v8.16b
-; NO_SVE-NEXT:    stp q1, q0, [x0, #64]
-; NO_SVE-NEXT:    mov v0.16b, v8.16b
-; NO_SVE-NEXT:    bsl v2.16b, v20.16b, v25.16b
-; NO_SVE-NEXT:    bsl v3.16b, v21.16b, v26.16b
-; NO_SVE-NEXT:    bsl v4.16b, v22.16b, v27.16b
-; NO_SVE-NEXT:    bsl v0.16b, v23.16b, v11.16b
-; NO_SVE-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; NO_SVE-NEXT:    stp q3, q2, [x0, #32]
-; NO_SVE-NEXT:    stp q0, q4, [x0]
-; NO_SVE-NEXT:    ldp d11, d10, [sp], #32 // 16-byte Folded Reload
-; NO_SVE-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: select_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    and w8, w2, #0x1
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ptrue p1.d
-; VBITS_GE_2048-NEXT:    mov z2.d, x8
-; VBITS_GE_2048-NEXT:    and z2.d, z2.d, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT:    sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z2.d, z2.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <32 x double>, <32 x double>* %a
   %op2 = load volatile <32 x double>, <32 x double>* %b
   %sel = select i1 %mask, <32 x double> %op1, <32 x double> %op2

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
index f4806abd06e4..dd6b1e41fb4b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -1,58 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; FCVTZU H -> H
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) #0 {
+define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i16:
-; CHECK: fcvtzu v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @fcvtzu_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) #0 {
+define void @fcvtzu_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: fcvtzu v0.8h, v0.8h
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    fcvtzu v0.8h, v0.8h
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fptoui <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, <8 x i16>* %b
   ret void
 }
 
-define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fptoui <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %b
@@ -60,49 +48,53 @@ define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @fcvtzu_v32f16_v32i16(<32 x half>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v32f16_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v32f16_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    fcvtzu z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v32f16_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fptoui <32 x half> %op1 to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %b
   ret void
 }
 
-define void @fcvtzu_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzu_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v64f16_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %res = fptoui <64 x half> %op1 to <64 x i16>
   store <64 x i16> %res, <64 x i16>* %b
   ret void
 }
 
-define void @fcvtzu_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
+define void @fcvtzu_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v128f16_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %res = fptoui <128 x half> %op1 to <128 x i16>
   store <128 x i16> %res, <128 x i16>* %b
@@ -114,32 +106,37 @@ define void @fcvtzu_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) #0 {
+define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f16_v2i32:
-; CHECK: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) #0 {
+define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i32:
-; CHECK: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
 
-define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i32:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[UPK]].h
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fptoui <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %b
@@ -147,57 +144,62 @@ define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v16f16_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, [[VEC]].h
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, [[VEC_HI]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x1, x[[NUMELTS]], lsl #2]
+; VBITS_GE_256-LABEL: fcvtzu_v16f16_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %b
   ret void
 }
 
-define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f16_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fptoui <32 x half> %op1 to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %b
   ret void
 }
 
-define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v64f16_v64i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %res = fptoui <64 x half> %op1 to <64 x i32>
   store <64 x i32> %res, <64 x i32>* %b
@@ -209,36 +211,41 @@ define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) #0 {
+define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v1f16_v1i64:
-; CHECK: fcvtzu x8, h0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %res = fptoui <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) #0 {
+define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f16_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzu z0.d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
 
-define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i64:
-; CHECK: ldr d[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x half>, <4 x half>* %a
   %res = fptoui <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %b
@@ -246,61 +253,65 @@ define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @fcvtzu_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f16_v8i64:
-; VBITS_GE_512: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f16_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f16_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ldr q0, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fptoui <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %b
   ret void
 }
 
-define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fptoui <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %b
   ret void
 }
 
-define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f16_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK]].s
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fptoui <32 x half> %op1 to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %b
@@ -312,101 +323,110 @@ define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) #0 {
+define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f32_v2i16:
-; CHECK: fcvtzs v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) #0 {
+define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f32_v4i16:
-; CHECK: fcvtzu v1.4s, v0.4s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v1.4s, v0.4s
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
 
-define <8 x i16> @fcvtzu_v8f32_v8i16(<8 x float>* %a) #0 {
+define <8 x i16> @fcvtzu_v8f32_v8i16(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v8f32_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; CHECK-NEXT: uzp1 z0.h, [[CVT]].h, [[CVT]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptoui <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
 }
 
 define void @fcvtzu_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v16f32_v16i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s
+; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s
+; VBITS_GE_512-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %b
   ret void
 }
 
-define void @fcvtzu_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzu_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f32_v32i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptoui <32 x float> %op1 to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %b
   ret void
 }
 
-define void @fcvtzu_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzu_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v64f32_v64i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptoui <64 x float> %op1 to <64 x i16>
   store <64 x i16> %res, <64 x i16>* %b
@@ -418,30 +438,33 @@ define void @fcvtzu_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) #0 {
+define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f32_v2i32:
-; CHECK: fcvtzu v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) #0 {
+define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f32_v4i32:
-; CHECK: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
 
-define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v8f32_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptoui <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %b
@@ -449,49 +472,53 @@ define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @fcvtzu_v16f32_v16i32(<16 x float>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v16f32_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptoui <16 x float> %op1 to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %b
   ret void
 }
 
-define void @fcvtzu_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzu_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f32_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptoui <32 x float> %op1 to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %b
   ret void
 }
 
-define void @fcvtzu_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzu_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v64f32_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptoui <64 x float> %op1 to <64 x i32>
   store <64 x i32> %res, <64 x i32>* %b
@@ -503,33 +530,37 @@ define void @fcvtzu_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) #0 {
+define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v1f32_v1i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %res = fptoui <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) #0 {
+define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f32_v2i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
 
-define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f32_v4i64:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[OP]].s
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK]].s
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x float>, <4 x float>* %a
   %res = fptoui <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %b
@@ -537,57 +568,62 @@ define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f32_v8i64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG1]]/m, [[UPK]].s
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, [[VEC]].s
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, [[VEC_HI]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
+; VBITS_GE_256-LABEL: fcvtzu_v8f32_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %b
   ret void
 }
 
-define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f32_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptoui <16 x float> %op1 to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %b
   ret void
 }
 
-define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f32_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptoui <32 x float> %op1 to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %b
@@ -600,98 +636,110 @@ define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; v1f64 is preferred to be widened to v4f64, so use SVE
-define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) #0 {
+define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) #0 {
+define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f64_v2i16:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
 
-define <4 x i16> @fcvtzu_v4f64_v4i16(<4 x double>* %a) #0 {
+define <4 x i16> @fcvtzu_v4f64_v4i16(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptoui <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
 
 define <8 x i16> @fcvtzu_v8f64_v8i16(<8 x double>* %a) #0 {
-; CHECK-LABEL: fcvtzu_v8f64_v8i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d
+; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d
+; VBITS_GE_512-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptoui <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
 }
 
-define void @fcvtzu_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzu_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f64_v16i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptoui <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %b
   ret void
 }
 
-define void @fcvtzu_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzu_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f64_v32i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptoui <32 x double> %op1 to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %b
@@ -703,96 +751,105 @@ define void @fcvtzu_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) #0 {
+define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i32:
-; CHECK: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) #0 {
+define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f64_v2i32:
-; CHECK: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
 
-define <4 x i32> @fcvtzu_v4f64_v4i32(<4 x double>* %a) #0 {
+define <4 x i32> @fcvtzu_v4f64_v4i32(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i32:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 z0.s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptoui <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
 
 define void @fcvtzu_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f64_v8i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d
+; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d
+; VBITS_GE_512-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %b
   ret void
 }
 
-define void @fcvtzu_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) #0 {
+define void @fcvtzu_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f64_v16i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptoui <16 x double> %op1 to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %b
   ret void
 }
 
-define void @fcvtzu_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzu_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f64_v32i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptoui <32 x double> %op1 to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %b
@@ -804,31 +861,34 @@ define void @fcvtzu_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) #0 {
+define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i64:
-; CHECK: fcvtzu x8, d0
-; CHECK: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) #0 {
+define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v2f64_v2i64:
-; CHECK: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
 
-define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptoui <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %b
@@ -836,49 +896,53 @@ define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @fcvtzu_v8f64_v8i64(<8 x double>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzu_v8f64_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptoui <8 x double> %op1 to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %b
   ret void
 }
 
-define void @fcvtzu_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzu_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f64_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptoui <16 x double> %op1 to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %b
   ret void
 }
 
-define void @fcvtzu_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzu_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f64_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptoui <32 x double> %op1 to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %b
@@ -890,34 +954,37 @@ define void @fcvtzu_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) #0 {
+define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i16:
-; CHECK: fcvtzs v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @fcvtzs_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) #0 {
+define void @fcvtzs_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: fcvtzs v0.8h, v0.8h
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    fcvtzs v0.8h, v0.8h
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fptosi <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, <8 x i16>* %b
   ret void
 }
 
-define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fptosi <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %b
@@ -925,49 +992,53 @@ define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @fcvtzs_v32f16_v32i16(<32 x half>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v32f16_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v32f16_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    fcvtzs z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v32f16_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fptosi <32 x half> %op1 to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %b
   ret void
 }
 
-define void @fcvtzs_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzs_v64f16_v64i16(<64 x half>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v64f16_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %res = fptosi <64 x half> %op1 to <64 x i16>
   store <64 x i16> %res, <64 x i16>* %b
   ret void
 }
 
-define void @fcvtzs_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
+define void @fcvtzs_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v128f16_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %res = fptosi <128 x half> %op1 to <128 x i16>
   store <128 x i16> %res, <128 x i16>* %b
@@ -979,32 +1050,37 @@ define void @fcvtzs_v128f16_v128i16(<128 x half>* %a, <128 x i16>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) #0 {
+define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f16_v2i32:
-; CHECK: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) #0 {
+define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i32:
-; CHECK: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
 
-define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i32:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[UPK]].h
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fptosi <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %b
@@ -1012,57 +1088,62 @@ define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v16f16_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, [[VEC]].h
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, [[VEC_HI]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x1, x[[NUMELTS]], lsl #2]
+; VBITS_GE_256-LABEL: fcvtzs_v16f16_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    fcvtzs z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %b
   ret void
 }
 
-define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f16_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fptosi <32 x half> %op1 to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %b
   ret void
 }
 
-define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v64f16_v64i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %res = fptosi <64 x half> %op1 to <64 x i32>
   store <64 x i32> %res, <64 x i32>* %b
@@ -1074,36 +1155,41 @@ define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) #0 {
+define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v1f16_v1i64:
-; CHECK: fcvtzs x8, h0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %res = fptosi <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) #0 {
+define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f16_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzs z0.d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
 
-define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i64:
-; CHECK: ldr d[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x half>, <4 x half>* %a
   %res = fptosi <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %b
@@ -1111,61 +1197,65 @@ define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @fcvtzs_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f16_v8i64:
-; VBITS_GE_512: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
-; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
-; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f16_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f16_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ldr q0, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fptosi <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %b
   ret void
 }
 
-define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fptosi <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %b
   ret void
 }
 
-define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f16_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
-; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK]].s
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, <32 x half>* %a
   %res = fptosi <32 x half> %op1 to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %b
@@ -1177,101 +1267,110 @@ define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) #0 {
+define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f32_v2i16:
-; CHECK: fcvtzs v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) #0 {
+define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f32_v4i16:
-; CHECK: fcvtzs v1.4s, v0.4s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v1.4s, v0.4s
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
 
-define <8 x i16> @fcvtzs_v8f32_v8i16(<8 x float>* %a) #0 {
+define <8 x i16> @fcvtzs_v8f32_v8i16(<8 x float>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v8f32_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; CHECK-NEXT: uzp1 z0.h, [[CVT]].h, [[CVT]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptosi <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
 }
 
 define void @fcvtzs_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v16f32_v16i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s
+; VBITS_GE_256-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s
+; VBITS_GE_512-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %b
   ret void
 }
 
-define void @fcvtzs_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzs_v32f32_v32i16(<32 x float>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f32_v32i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptosi <32 x float> %op1 to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %b
   ret void
 }
 
-define void @fcvtzs_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
+define void @fcvtzs_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v64f32_v64i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptosi <64 x float> %op1 to <64 x i16>
   store <64 x i16> %res, <64 x i16>* %b
@@ -1283,30 +1382,33 @@ define void @fcvtzs_v64f32_v64i16(<64 x float>* %a, <64 x i16>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) #0 {
+define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f32_v2i32:
-; CHECK: fcvtzs v0.2s, v0.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) #0 {
+define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f32_v4i32:
-; CHECK: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
 
-define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
+define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v8f32_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptosi <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %b
@@ -1314,49 +1416,53 @@ define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @fcvtzs_v16f32_v16i32(<16 x float>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v16f32_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptosi <16 x float> %op1 to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %b
   ret void
 }
 
-define void @fcvtzs_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzs_v32f32_v32i32(<32 x float>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f32_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptosi <32 x float> %op1 to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %b
   ret void
 }
 
-define void @fcvtzs_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
+define void @fcvtzs_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v64f32_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptosi <64 x float> %op1 to <64 x i32>
   store <64 x i32> %res, <64 x i32>* %b
@@ -1368,33 +1474,37 @@ define void @fcvtzs_v64f32_v64i32(<64 x float>* %a, <64 x i32>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) #0 {
+define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v1f32_v1i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %res = fptosi <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) #0 {
+define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f32_v2i64:
-; CHECK: fcvtl v0.2d, v0.2s
-; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
 
-define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f32_v4i64:
-; CHECK: ldr q[[OP:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[OP]].s
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK]].s
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x float>, <4 x float>* %a
   %res = fptosi <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %b
@@ -1402,57 +1512,62 @@ define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f32_v8i64:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG1]]/m, [[UPK]].s
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, [[VEC]].s
-; VBITS_EQ_256-DAG: ext [[VEC_HI:z[0-9]+]].b, [[VEC]].b, [[VEC]].b, #16
-; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, [[VEC]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
+; VBITS_GE_256-LABEL: fcvtzs_v8f32_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %b
   ret void
 }
 
-define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f32_v16i64:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptosi <16 x float> %op1 to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %b
   ret void
 }
 
-define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f32_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptosi <32 x float> %op1 to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %b
@@ -1465,98 +1580,110 @@ define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) #0 {
 ;
 
; v1f64 is preferred to be widened to v4f64, so use SVE
-define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) #0 {
+define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG]]/m, z0.d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) #0 {
+define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f64_v2i16:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
 
-define <4 x i16> @fcvtzs_v4f64_v4i16(<4 x double>* %a) #0 {
+define <4 x i16> @fcvtzs_v4f64_v4i16(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptosi <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
 
 define <8 x i16> @fcvtzs_v8f64_v8i16(<8 x double>* %a) #0 {
-; CHECK-LABEL: fcvtzs_v8f64_v8i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d
+; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d
+; VBITS_GE_512-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptosi <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
 }
 
-define void @fcvtzs_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 {
+define void @fcvtzs_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f64_v16i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptosi <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %b
   ret void
 }
 
-define void @fcvtzs_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
+define void @fcvtzs_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f64_v32i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptosi <32 x double> %op1 to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %b
@@ -1568,96 +1695,105 @@ define void @fcvtzs_v32f64_v32i16(<32 x double>* %a, <32 x i16>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) #0 {
+define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i32:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) #0 {
+define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f64_v2i32:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
 
-define <4 x i32> @fcvtzs_v4f64_v4i32(<4 x double>* %a) #0 {
+define <4 x i32> @fcvtzs_v4f64_v4i32(<4 x double>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i32:
-; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; CHECK-NEXT: uzp1 z0.s, [[CVT]].s, [[CVT]].s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptosi <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
 
 define void @fcvtzs_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f64_v8i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d
+; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d
+; VBITS_GE_512-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %b
   ret void
 }
 
-define void @fcvtzs_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) #0 {
+define void @fcvtzs_v16f64_v16i32(<16 x double>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f64_v16i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptosi <16 x double> %op1 to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %b
   ret void
 }
 
-define void @fcvtzs_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
+define void @fcvtzs_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f64_v32i32:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptosi <32 x double> %op1 to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %b
@@ -1669,31 +1805,34 @@ define void @fcvtzs_v32f64_v32i32(<32 x double>* %a, <32 x i32>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) #0 {
+define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i64:
-; CHECK: fcvtzs x8, d0
-; CHECK: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) #0 {
+define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v2f64_v2i64:
-; CHECK: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
 
-define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
+define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptosi <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %b
@@ -1701,49 +1840,53 @@ define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @fcvtzs_v8f64_v8i64(<8 x double>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: fcvtzs_v8f64_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d
-; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptosi <8 x double> %op1 to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %b
   ret void
 }
 
-define void @fcvtzs_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) #0 {
+define void @fcvtzs_v16f64_v16i64(<16 x double>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f64_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptosi <16 x double> %op1 to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %b
   ret void
 }
 
-define void @fcvtzs_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) #0 {
+define void @fcvtzs_v32f64_v32i64(<32 x double>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f64_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptosi <32 x double> %op1 to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %b

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 6d16273f98cd..73e6693bb2ff 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -1,26 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 {
+define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v2.4h, v2.4h, #15
@@ -32,7 +18,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 {
+define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
@@ -44,7 +30,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
   ret <8 x half> %sel
 }
 
-define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -96,44 +82,16 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z2.h, z5.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z1.h, z4.h
-; VBITS_GE_256-NEXT:    fcmeq p3.h, p0/z, z0.h, z6.h
-; VBITS_GE_256-NEXT:    fcmeq p4.h, p0/z, z3.h, z7.h
-; VBITS_GE_256-NEXT:    sel z0.h, p3, z0.h, z6.h
-; VBITS_GE_256-NEXT:    sel z1.h, p2, z1.h, z4.h
-; VBITS_GE_256-NEXT:    sel z2.h, p1, z2.h, z5.h
-; VBITS_GE_256-NEXT:    sel z3.h, p4, z3.h, z7.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: select_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %mask = fcmp oeq <64 x half> %op1, %op2
@@ -142,68 +100,16 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    mov x11, #80
-; VBITS_GE_256-NEXT:    mov x12, #64
-; VBITS_GE_256-NEXT:    mov x13, #112
-; VBITS_GE_256-NEXT:    mov x14, #96
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z18.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z19.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z20.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z21.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z22.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z23.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z6.h, z17.h
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z5.h, z16.h
-; VBITS_GE_256-NEXT:    fcmeq p3.h, p0/z, z4.h, z19.h
-; VBITS_GE_256-NEXT:    fcmeq p4.h, p0/z, z3.h, z18.h
-; VBITS_GE_256-NEXT:    fcmeq p5.h, p0/z, z2.h, z21.h
-; VBITS_GE_256-NEXT:    fcmeq p6.h, p0/z, z1.h, z20.h
-; VBITS_GE_256-NEXT:    fcmeq p7.h, p0/z, z0.h, z22.h
-; VBITS_GE_256-NEXT:    fcmeq p8.h, p0/z, z7.h, z23.h
-; VBITS_GE_256-NEXT:    sel z0.h, p7, z0.h, z22.h
-; VBITS_GE_256-NEXT:    sel z1.h, p6, z1.h, z20.h
-; VBITS_GE_256-NEXT:    sel z2.h, p5, z2.h, z21.h
-; VBITS_GE_256-NEXT:    sel z3.h, p4, z3.h, z18.h
-; VBITS_GE_256-NEXT:    sel z4.h, p3, z4.h, z19.h
-; VBITS_GE_256-NEXT:    sel z5.h, p2, z5.h, z16.h
-; VBITS_GE_256-NEXT:    sel z6.h, p1, z6.h, z17.h
-; VBITS_GE_256-NEXT:    sel z7.h, p8, z7.h, z23.h
-; VBITS_GE_256-NEXT:    st1h { z6.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: select_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %mask = fcmp oeq <128 x half> %op1, %op2
@@ -213,7 +119,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 {
+define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #31
@@ -225,7 +131,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 {
+define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
@@ -237,7 +143,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
   ret <4 x float> %sel
 }
 
-define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -289,44 +195,16 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT:    sel z0.s, p3, z0.s, z6.s
-; VBITS_GE_256-NEXT:    sel z1.s, p2, z1.s, z4.s
-; VBITS_GE_256-NEXT:    sel z2.s, p1, z2.s, z5.s
-; VBITS_GE_256-NEXT:    sel z3.s, p4, z3.s, z7.s
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: select_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %mask = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +213,16 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #56
-; VBITS_GE_256-NEXT:    mov x14, #48
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z4.s, z19.s
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z3.s, z18.s
-; VBITS_GE_256-NEXT:    fcmeq p5.s, p0/z, z2.s, z21.s
-; VBITS_GE_256-NEXT:    fcmeq p6.s, p0/z, z1.s, z20.s
-; VBITS_GE_256-NEXT:    fcmeq p7.s, p0/z, z0.s, z22.s
-; VBITS_GE_256-NEXT:    fcmeq p8.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT:    sel z0.s, p7, z0.s, z22.s
-; VBITS_GE_256-NEXT:    sel z1.s, p6, z1.s, z20.s
-; VBITS_GE_256-NEXT:    sel z2.s, p5, z2.s, z21.s
-; VBITS_GE_256-NEXT:    sel z3.s, p4, z3.s, z18.s
-; VBITS_GE_256-NEXT:    sel z4.s, p3, z4.s, z19.s
-; VBITS_GE_256-NEXT:    sel z5.s, p2, z5.s, z16.s
-; VBITS_GE_256-NEXT:    sel z6.s, p1, z6.s, z17.s
-; VBITS_GE_256-NEXT:    sel z7.s, p8, z7.s, z23.s
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: select_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %mask = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +232,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 {
+define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -419,7 +245,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 {
+define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
@@ -431,7 +257,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
   ret <2 x double> %sel
 }
 
-define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -483,44 +309,16 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z2.d, z5.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, z4.d
-; VBITS_GE_256-NEXT:    fcmeq p3.d, p0/z, z0.d, z6.d
-; VBITS_GE_256-NEXT:    fcmeq p4.d, p0/z, z3.d, z7.d
-; VBITS_GE_256-NEXT:    sel z0.d, p3, z0.d, z6.d
-; VBITS_GE_256-NEXT:    sel z1.d, p2, z1.d, z4.d
-; VBITS_GE_256-NEXT:    sel z2.d, p1, z2.d, z5.d
-; VBITS_GE_256-NEXT:    sel z3.d, p4, z3.d, z7.d
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: select_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %mask = fcmp oeq <16 x double> %op1, %op2
@@ -529,68 +327,16 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: select_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z19.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z20.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z21.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z22.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z6.d, z17.d
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z5.d, z16.d
-; VBITS_GE_256-NEXT:    fcmeq p3.d, p0/z, z4.d, z19.d
-; VBITS_GE_256-NEXT:    fcmeq p4.d, p0/z, z3.d, z18.d
-; VBITS_GE_256-NEXT:    fcmeq p5.d, p0/z, z2.d, z21.d
-; VBITS_GE_256-NEXT:    fcmeq p6.d, p0/z, z1.d, z20.d
-; VBITS_GE_256-NEXT:    fcmeq p7.d, p0/z, z0.d, z22.d
-; VBITS_GE_256-NEXT:    fcmeq p8.d, p0/z, z7.d, z23.d
-; VBITS_GE_256-NEXT:    sel z0.d, p7, z0.d, z22.d
-; VBITS_GE_256-NEXT:    sel z1.d, p6, z1.d, z20.d
-; VBITS_GE_256-NEXT:    sel z2.d, p5, z2.d, z21.d
-; VBITS_GE_256-NEXT:    sel z3.d, p4, z3.d, z18.d
-; VBITS_GE_256-NEXT:    sel z4.d, p3, z4.d, z19.d
-; VBITS_GE_256-NEXT:    sel z5.d, p2, z5.d, z16.d
-; VBITS_GE_256-NEXT:    sel z6.d, p1, z6.d, z17.d
-; VBITS_GE_256-NEXT:    sel z7.d, p8, z7.d, z23.d
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: select_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    sel z0.d, p1, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %mask = fcmp oeq <32 x double> %op1, %op2
@@ -599,4 +345,4 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
   ret void
 }
 
-attributes #0 = { "target-features"="+sve" uwtable }
+attributes #0 = { "target-features"="+sve" }
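
The restructure above follows one pattern: a function tagged vscale_range(N,0) is guaranteed at least N x 128 bits of SVE register regardless of the -aarch64-sve-vector-bits-min value on the RUN line, so all three remaining RUN lines produce the same code for it and a single CHECK body covers them, while functions left with a bare #0 still differ between the 256-bit and 512-bit runs and keep separate VBITS_GE_256/VBITS_GE_512 bodies. A minimal sketch of the new test layout is below; the file contents and the @example_v64f16 function are illustrative only, not taken from the patch:

    ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
    ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
    ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

    target triple = "aarch64-unknown-linux-gnu"

    ; vscale_range(8,0) promises vscale >= 8 (i.e. registers of at least 1024
    ; bits), so the generated code is identical under every RUN line above and
    ; update_llc_test_checks.py can emit plain CHECK lines for this function.
    define void @example_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
      %op1 = load <64 x half>, <64 x half>* %a
      %op2 = load <64 x half>, <64 x half>* %b
      %res = fadd <64 x half> %op1, %op2
      store <64 x half> %res, <64 x half>* %a
      ret void
    }

    attributes #0 = { "target-features"="+sve" }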

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
index 695697ee5930..456d9fe2fd40 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,49 +10,66 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v4f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    fmov h1, #5.00000000
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $q0
-; VBITS_GE_256-NEXT:    mov v0.h[3], v1.h[0]
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT:    ret
+define <4 x half> @insertelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov h1, #5.00000000
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.h[3], v1.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
     %r = insertelement <4 x half> %op1, half 5.0, i64 3
     ret <4 x half> %r
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @insertelement_v8f16(<8 x half> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v8f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    fmov h1, #5.00000000
-; VBITS_GE_256-NEXT:    mov v0.h[7], v1.h[0]
-; VBITS_GE_256-NEXT:    ret
+define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov h1, #5.00000000
+; CHECK-NEXT:    mov v0.h[7], v1.h[0]
+; CHECK-NEXT:    ret
     %r = insertelement <8 x half> %op1, half 5.0, i64 7
     ret <8 x half> %r
 }
 
-define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 {
-; VBITS_GE_256-LABEL: insertelement_v16f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w9, #15
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fmov h2, #5.00000000
-; VBITS_GE_256-NEXT:    index z3.h, #0, #1
-; VBITS_GE_256-NEXT:    ptrue p1.h
-; VBITS_GE_256-NEXT:    mov z1.h, w9
-; VBITS_GE_256-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_256-NEXT:    mov z0.h, p1/m, h2
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+define <16 x half> @insertelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #15
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fmov h2, #5.00000000
+; CHECK-NEXT:    index z3.h, #0, #1
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z1.h, w9
+; CHECK-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, h2
+; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <16 x half>, <16 x half>* %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
     ret <16 x half> %r
 }
 
 define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
+; VBITS_GE_256-LABEL: insertelement_v32f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    mov w10, #15
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    fmov h3, #5.00000000
+; VBITS_GE_256-NEXT:    index z4.h, #0, #1
+; VBITS_GE_256-NEXT:    ptrue p1.h
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov z2.h, w10
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p1/z, z4.h, z2.h
+; VBITS_GE_256-NEXT:    mov z0.h, p1/m, h3
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: insertelement_v32f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    mov w9, #31
@@ -85,88 +88,105 @@ define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
     ret <32 x half> %r
 }
 
-define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 {
-; VBITS_GE_1024-LABEL: insertelement_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov w9, #63
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fmov h2, #5.00000000
-; VBITS_GE_1024-NEXT:    index z3.h, #0, #1
-; VBITS_GE_1024-NEXT:    ptrue p1.h
-; VBITS_GE_1024-NEXT:    mov z1.h, w9
-; VBITS_GE_1024-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_1024-NEXT:    mov z0.h, p1/m, h2
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_GE_1024-NEXT:    ret
+define <64 x half> @insertelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: insertelement_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #63
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fmov h2, #5.00000000
+; CHECK-NEXT:    index z3.h, #0, #1
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z1.h, w9
+; CHECK-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, h2
+; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <64 x half>, <64 x half>* %a
     %r = insertelement <64 x half> %op1, half 5.0, i64 63
     ret <64 x half> %r
 }
 
-define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 {
-; VBITS_GE_2048-LABEL: insertelement_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    mov w9, #127
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fmov h2, #5.00000000
-; VBITS_GE_2048-NEXT:    index z3.h, #0, #1
-; VBITS_GE_2048-NEXT:    ptrue p1.h
-; VBITS_GE_2048-NEXT:    mov z1.h, w9
-; VBITS_GE_2048-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_2048-NEXT:    mov z0.h, p1/m, h2
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <128 x half> @insertelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: insertelement_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #127
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    fmov h2, #5.00000000
+; CHECK-NEXT:    index z3.h, #0, #1
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z1.h, w9
+; CHECK-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, h2
+; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <128 x half>, <128 x half>* %a
     %r = insertelement <128 x half> %op1, half 5.0, i64 127
     ret <128 x half> %r
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v2f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    fmov s1, #5.00000000
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $q0
-; VBITS_GE_256-NEXT:    mov v0.s[1], v1.s[0]
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT:    ret
+define <2 x float> @insertelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, #5.00000000
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
     %r = insertelement <2 x float> %op1, float 5.0, i64 1
     ret <2 x float> %r
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @insertelement_v4f32(<4 x float> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v4f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    fmov s1, #5.00000000
-; VBITS_GE_256-NEXT:    mov v0.s[3], v1.s[0]
-; VBITS_GE_256-NEXT:    ret
+define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, #5.00000000
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-NEXT:    ret
     %r = insertelement <4 x float> %op1, float 5.0, i64 3
     ret <4 x float> %r
 }
 
-define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 {
-; VBITS_GE_256-LABEL: insertelement_v8f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w9, #7
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fmov s2, #5.00000000
-; VBITS_GE_256-NEXT:    index z3.s, #0, #1
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    mov z1.s, w9
-; VBITS_GE_256-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_256-NEXT:    mov z0.s, p1/m, s2
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+define <8 x float> @insertelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #7
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fmov s2, #5.00000000
+; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z1.s, w9
+; CHECK-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, s2
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <8 x float>, <8 x float>* %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
     ret <8 x float> %r
 }
 
 define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
+; VBITS_GE_256-LABEL: insertelement_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    mov w10, #7
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    fmov s3, #5.00000000
+; VBITS_GE_256-NEXT:    index z4.s, #0, #1
+; VBITS_GE_256-NEXT:    ptrue p1.s
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov z2.s, w10
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p1/z, z4.s, z2.s
+; VBITS_GE_256-NEXT:    mov z0.s, p1/m, s3
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: insertelement_v16f32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    mov w9, #15
@@ -185,86 +205,103 @@ define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
     ret <16 x float> %r
 }
 
-define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 {
-; VBITS_GE_1024-LABEL: insertelement_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov w9, #31
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fmov s2, #5.00000000
-; VBITS_GE_1024-NEXT:    index z3.s, #0, #1
-; VBITS_GE_1024-NEXT:    ptrue p1.s
-; VBITS_GE_1024-NEXT:    mov z1.s, w9
-; VBITS_GE_1024-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_1024-NEXT:    mov z0.s, p1/m, s2
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT:    ret
+define <32 x float> @insertelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: insertelement_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #31
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fmov s2, #5.00000000
+; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z1.s, w9
+; CHECK-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, s2
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <32 x float>, <32 x float>* %a
     %r = insertelement <32 x float> %op1, float 5.0, i64 31
     ret <32 x float> %r
 }
 
-define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 {
-; VBITS_GE_2048-LABEL: insertelement_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    mov w9, #63
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fmov s2, #5.00000000
-; VBITS_GE_2048-NEXT:    index z3.s, #0, #1
-; VBITS_GE_2048-NEXT:    ptrue p1.s
-; VBITS_GE_2048-NEXT:    mov z1.s, w9
-; VBITS_GE_2048-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_2048-NEXT:    mov z0.s, p1/m, s2
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <64 x float> @insertelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: insertelement_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #63
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    fmov s2, #5.00000000
+; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z1.s, w9
+; CHECK-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, s2
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <64 x float>, <64 x float>* %a
     %r = insertelement <64 x float> %op1, float 5.0, i64 63
     ret <64 x float> %r
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v1f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4617315517961601024
-; VBITS_GE_256-NEXT:    fmov d0, x8
-; VBITS_GE_256-NEXT:    ret
+define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #4617315517961601024
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
     ret <1 x double> %r
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @insertelement_v2f64(<2 x double> %op1) #0 {
-; VBITS_GE_256-LABEL: insertelement_v2f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    fmov d1, #5.00000000
-; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    ret
+define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, #5.00000000
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
     %r = insertelement <2 x double> %op1, double 5.0, i64 1
     ret <2 x double> %r
 }
 
-define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 {
-; VBITS_GE_256-LABEL: insertelement_v4f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w9, #3
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    fmov d2, #5.00000000
-; VBITS_GE_256-NEXT:    index z3.d, #0, #1
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    mov z1.d, x9
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_256-NEXT:    mov z0.d, p1/m, d2
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT:    ret
+define <4 x double> @insertelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
+; CHECK-LABEL: insertelement_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #3
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fmov d2, #5.00000000
+; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z1.d, x9
+; CHECK-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, d2
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <4 x double>, <4 x double>* %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
     ret <4 x double> %r
 }
 
 define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
+; VBITS_GE_256-LABEL: insertelement_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    mov w10, #3
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    fmov d3, #5.00000000
+; VBITS_GE_256-NEXT:    index z4.d, #0, #1
+; VBITS_GE_256-NEXT:    ptrue p1.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov z2.d, x10
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p1/z, z4.d, z2.d
+; VBITS_GE_256-NEXT:    mov z0.d, p1/m, d3
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: insertelement_v8f64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    mov w9, #7
@@ -283,39 +320,39 @@ define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
     ret <8 x double> %r
 }
 
-define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 {
-; VBITS_GE_1024-LABEL: insertelement_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov w9, #15
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    fmov d2, #5.00000000
-; VBITS_GE_1024-NEXT:    index z3.d, #0, #1
-; VBITS_GE_1024-NEXT:    ptrue p1.d
-; VBITS_GE_1024-NEXT:    mov z1.d, x9
-; VBITS_GE_1024-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_1024-NEXT:    mov z0.d, p1/m, d2
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_1024-NEXT:    ret
+define <16 x double> @insertelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: insertelement_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #15
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fmov d2, #5.00000000
+; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z1.d, x9
+; CHECK-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, d2
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <16 x double>, <16 x double>* %a
     %r = insertelement <16 x double> %op1, double 5.0, i64 15
     ret <16 x double> %r
 }
 
-define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 {
-; VBITS_GE_2048-LABEL: insertelement_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    mov w9, #31
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    fmov d2, #5.00000000
-; VBITS_GE_2048-NEXT:    index z3.d, #0, #1
-; VBITS_GE_2048-NEXT:    ptrue p1.d
-; VBITS_GE_2048-NEXT:    mov z1.d, x9
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_2048-NEXT:    mov z0.d, p1/m, d2
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x double> @insertelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: insertelement_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #31
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    fmov d2, #5.00000000
+; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z1.d, x9
+; CHECK-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, d2
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
     %op1 = load <32 x double>, <32 x double>* %a
     %r = insertelement <32 x double> %op1, double 5.0, i64 31
     ret <32 x double> %r
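
For reference when reading the sve-fixed-length-int-arith.ll diff that follows: the manual checks deleted there relied on FileCheck numeric substitution, where each RUN line defined a numeric variable via -D#VBYTES=<n> and the check lines computed the expected active vector length from it. A simplified reconstruction of that style (not text from the patch) is:

    ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64
    ; With VBYTES=64 the numeric expression below evaluates to min(64,32) = 32,
    ; so the directive matches an instruction such as "ptrue p0.b, vl32".
    ; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]

The autogenerated replacements instead check concrete register names and vector lengths, which is what allows them to be regenerated with update_llc_test_checks.py.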

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index 1d94566f1a8a..09d7595b205b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -1,24 +1,7 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -27,31 +10,34 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v8i8:
-; CHECK: add v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = add <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v16i8:
-; CHECK: add v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = add <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = add <32 x i8> %op1, %op2
@@ -60,18 +46,28 @@ define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: add_v64i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b
-; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.b, z0.b, z2.b
+; VBITS_GE_256-NEXT:    add z1.b, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = add <64 x i8> %op1, %op2
@@ -79,29 +75,15 @@ define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: add_v128i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b
-; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[OP1_2]].b, [[OP2_2]].b
-; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[OP1_3]].b, [[OP2_3]].b
-; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = add <128 x i8> %op1, %op2
@@ -109,49 +91,15 @@ define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: add_v256i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b
-; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[OP1_2]].b, [[OP2_2]].b
-; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[OP1_3]].b, [[OP2_3]].b
-; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
-; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[OP1_4]].b, [[OP2_4]].b
-; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
-; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[OP1_5]].b, [[OP2_5]].b
-; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
-; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[OP1_6]].b, [[OP2_6]].b
-; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
-; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[OP1_7]].b, [[OP2_7]].b
-; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = add <256 x i8> %op1, %op2
@@ -160,31 +108,34 @@ define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v4i16:
-; CHECK: add v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = add <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v8i16:
-; CHECK: add v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = add <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = add <16 x i16> %op1, %op2
@@ -192,16 +143,29 @@ define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
 define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: add_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    add z1.h, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = add <32 x i16> %op1, %op2
@@ -209,16 +173,15 @@ define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: add_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = add <64 x i16> %op1, %op2
@@ -226,16 +189,15 @@ define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: add_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = add <128 x i16> %op1, %op2
@@ -244,31 +206,34 @@ define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v2i32:
-; CHECK: add v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = add <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v4i32:
-; CHECK: add v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = add <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = add <8 x i32> %op1, %op2
@@ -276,16 +241,29 @@ define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
 define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: add_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.s, z0.s, z2.s
+; VBITS_GE_256-NEXT:    add z1.s, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = add <16 x i32> %op1, %op2
@@ -293,16 +271,15 @@ define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: add_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = add <32 x i32> %op1, %op2
@@ -310,16 +287,15 @@ define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: add_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = add <64 x i32> %op1, %op2
@@ -328,31 +304,34 @@ define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v1i64:
-; CHECK: add d0, d0, d1
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    ret
   %res = add <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v2i64:
-; CHECK: add v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = add <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: add_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = add <4 x i64> %op1, %op2
@@ -360,16 +339,29 @@ define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
 define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: add_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: add_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    add z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = add <8 x i64> %op1, %op2
@@ -377,16 +369,15 @@ define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: add_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = add <16 x i64> %op1, %op2
@@ -394,16 +385,20 @@ define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
-; already cover the general legalisation cases.
-define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: add_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #16
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = add <32 x i64> %op1, %op2
@@ -411,41 +406,39 @@ define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
   ret void
 }
 
-;
-; NOTE: Tests beyond this point only have CHECK lines to validate the first
-; VBYTES because the add tests already validate the legalisation code paths.
-;
-
 ;
 ; MUL
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v8i8:
-; CHECK: mul v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = mul <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v16i8:
-; CHECK: mul v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = mul <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = mul <32 x i8> %op1, %op2
@@ -454,13 +447,28 @@ define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: mul_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    mul z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = mul <64 x i8> %op1, %op2
@@ -468,14 +476,15 @@ define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: mul_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = mul <128 x i8> %op1, %op2
@@ -483,14 +492,15 @@ define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: mul_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = mul <256 x i8> %op1, %op2
@@ -499,31 +509,34 @@ define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v4i16:
-; CHECK: mul v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = mul <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v8i16:
-; CHECK: mul v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = mul <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = mul <16 x i16> %op1, %op2
@@ -532,13 +545,28 @@ define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: mul_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    mul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = mul <32 x i16> %op1, %op2
@@ -546,14 +574,15 @@ define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: mul_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = mul <64 x i16> %op1, %op2
@@ -561,14 +590,15 @@ define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: mul_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = mul <128 x i16> %op1, %op2
@@ -577,31 +607,34 @@ define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v2i32:
-; CHECK: mul v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = mul <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v4i32:
-; CHECK: mul v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = mul <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = mul <8 x i32> %op1, %op2
@@ -610,13 +643,28 @@ define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: mul_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    mul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = mul <16 x i32> %op1, %op2
@@ -624,14 +672,15 @@ define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: mul_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = mul <32 x i32> %op1, %op2
@@ -639,14 +688,15 @@ define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: mul_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = mul <64 x i32> %op1, %op2
@@ -656,42 +706,39 @@ define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 
 define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: mul_v1i64:
-; VBITS_EQ_128:         ptrue p0.d, vl1
-; VBITS_EQ_128:         mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128:         ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: mul_v2i64:
-; VBITS_EQ_128:         ptrue p0.d, vl2
-; VBITS_EQ_128:         mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128:         ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: mul_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = mul <4 x i64> %op1, %op2
@@ -700,13 +747,28 @@ define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: mul_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: mul_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    mul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = mul <8 x i64> %op1, %op2
@@ -714,14 +776,15 @@ define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: mul_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = mul <16 x i64> %op1, %op2
@@ -729,14 +792,15 @@ define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: mul_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = mul <32 x i64> %op1, %op2
@@ -749,31 +813,34 @@ define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v8i8:
-; CHECK: sub v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = sub <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v16i8:
-; CHECK: sub v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = sub <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = sub <32 x i8> %op1, %op2
@@ -782,13 +849,28 @@ define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: sub_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.b, z0.b, z2.b
+; VBITS_GE_256-NEXT:    sub z1.b, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = sub <64 x i8> %op1, %op2
@@ -796,14 +878,15 @@ define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sub_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = sub <128 x i8> %op1, %op2
@@ -811,14 +894,15 @@ define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sub_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = sub <256 x i8> %op1, %op2
@@ -827,31 +911,34 @@ define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v4i16:
-; CHECK: sub v0.4h, v0.4h, v1.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = sub <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v8i16:
-; CHECK: sub v0.8h, v0.8h, v1.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = sub <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = sub <16 x i16> %op1, %op2
@@ -860,13 +947,28 @@ define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: sub_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    sub z1.h, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = sub <32 x i16> %op1, %op2
@@ -874,14 +976,15 @@ define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sub_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = sub <64 x i16> %op1, %op2
@@ -889,14 +992,15 @@ define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sub_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = sub <128 x i16> %op1, %op2
@@ -905,31 +1009,34 @@ define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v2i32:
-; CHECK: sub v0.2s, v0.2s, v1.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = sub <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v4i32:
-; CHECK: sub v0.4s, v0.4s, v1.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = sub <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = sub <8 x i32> %op1, %op2
@@ -938,13 +1045,28 @@ define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: sub_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.s, z0.s, z2.s
+; VBITS_GE_256-NEXT:    sub z1.s, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = sub <16 x i32> %op1, %op2
@@ -952,14 +1074,15 @@ define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sub_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = sub <32 x i32> %op1, %op2
@@ -967,14 +1090,15 @@ define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sub_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = sub <64 x i32> %op1, %op2
@@ -983,31 +1107,34 @@ define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v1i64:
-; CHECK: sub d0, d0, d1
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    ret
   %res = sub <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v2i64:
-; CHECK: sub v0.2d, v0.2d, v1.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = sub <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sub_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = sub <4 x i64> %op1, %op2
@@ -1016,13 +1143,28 @@ define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: sub_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: sub_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    sub z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = sub <8 x i64> %op1, %op2
@@ -1030,14 +1172,15 @@ define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sub_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = sub <16 x i64> %op1, %op2
@@ -1045,14 +1188,15 @@ define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sub_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = sub <32 x i64> %op1, %op2
@@ -1066,30 +1210,33 @@ define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @abs_v8i8(<8 x i8> %op1) #0 {
+define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v8i8:
-; CHECK: abs v0.8b, v0.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @abs_v16i8(<16 x i8> %op1) #0 {
+define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v16i8:
-; CHECK: abs v0.16b, v0.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
   ret <16 x i8> %res
 }
 
-define void @abs_v32i8(<32 x i8>* %a) #0 {
+define void @abs_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
   store <32 x i8> %res, <32 x i8>* %a
@@ -1097,38 +1244,53 @@ define void @abs_v32i8(<32 x i8>* %a) #0 {
 }
 
 define void @abs_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: abs_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: abs_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    abs z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    abs z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: abs_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    abs z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false)
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
-define void @abs_v128i8(<128 x i8>* %a) #0 {
+define void @abs_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: abs_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %res = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %op1, i1 false)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
 
-define void @abs_v256i8(<256 x i8>* %a) #0 {
+define void @abs_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: abs_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %res = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %op1, i1 false)
   store <256 x i8> %res, <256 x i8>* %a
@@ -1136,69 +1298,119 @@ define void @abs_v256i8(<256 x i8>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @abs_v4i16(<4 x i16> %op1) #0 {
+define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v4i16:
-; CHECK: abs v0.4h, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @abs_v8i16(<8 x i16> %op1) #0 {
+define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v8i16:
-; CHECK: abs v0.8h, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.8h, v0.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
   ret <8 x i16> %res
 }
 
-define void @abs_v16i16(<16 x i16>* %a) #0 {
+define void @abs_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
   store <16 x i16> %res, <16 x i16>* %a
   ret void
 }
 
-define void @abs_v32i16(<32 x i16>* %a) #0 {
+define void @abs_v32i16(<32 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #16
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.h, p0/m, z0.h
+; CHECK-NEXT:    abs z1.h, p0/m, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
-define void @abs_v64i16(<64 x i16>* %a) #0 {
+define void @abs_v64i16(<64 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #48
+; CHECK-NEXT:    mov x9, #16
+; CHECK-NEXT:    mov x10, #32
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0]
+; CHECK-NEXT:    abs z1.h, p0/m, z1.h
+; CHECK-NEXT:    abs z0.h, p0/m, z0.h
+; CHECK-NEXT:    abs z2.h, p0/m, z2.h
+; CHECK-NEXT:    abs z3.h, p0/m, z3.h
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
+; CHECK-NEXT:    st1h { z3.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %op1, i1 false)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @abs_v128i16(<128 x i16>* %a) #0 {
+define void @abs_v128i16(<128 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #96
+; CHECK-NEXT:    mov x9, #48
+; CHECK-NEXT:    mov x10, #16
+; CHECK-NEXT:    mov x11, #80
+; CHECK-NEXT:    mov x12, #32
+; CHECK-NEXT:    mov x13, #112
+; CHECK-NEXT:    mov x14, #64
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
+; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
+; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
+; CHECK-NEXT:    ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z7.h }, p0/z, [x0]
+; CHECK-NEXT:    abs z1.h, p0/m, z1.h
+; CHECK-NEXT:    abs z0.h, p0/m, z0.h
+; CHECK-NEXT:    abs z3.h, p0/m, z3.h
+; CHECK-NEXT:    abs z2.h, p0/m, z2.h
+; CHECK-NEXT:    abs z5.h, p0/m, z5.h
+; CHECK-NEXT:    abs z4.h, p0/m, z4.h
+; CHECK-NEXT:    abs z6.h, p0/m, z6.h
+; CHECK-NEXT:    abs z7.h, p0/m, z7.h
+; CHECK-NEXT:    st1h { z6.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT:    st1h { z4.h }, p0, [x0, x13, lsl #1]
+; CHECK-NEXT:    st1h { z5.h }, p0, [x0, x14, lsl #1]
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x11, lsl #1]
+; CHECK-NEXT:    st1h { z3.h }, p0, [x0, x12, lsl #1]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x9, lsl #1]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x10, lsl #1]
+; CHECK-NEXT:    st1h { z7.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
   store <128 x i16> %res, <128 x i16>* %a
@@ -1206,30 +1418,33 @@ define void @abs_v128i16(<128 x i16>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @abs_v2i32(<2 x i32> %op1) #0 {
+define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v2i32:
-; CHECK: abs v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @abs_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v4i32:
-; CHECK: abs v0.4s, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.4s, v0.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   ret <4 x i32> %res
 }
 
-define void @abs_v8i32(<8 x i32>* %a) #0 {
+define void @abs_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, <8 x i32>* %a
@@ -1237,38 +1452,53 @@ define void @abs_v8i32(<8 x i32>* %a) #0 {
 }
 
 define void @abs_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: abs_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: abs_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    abs z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    abs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: abs_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    abs z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
-define void @abs_v32i32(<32 x i32>* %a) #0 {
+define void @abs_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: abs_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %op1, i1 false)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @abs_v64i32(<64 x i32>* %a) #0 {
+define void @abs_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: abs_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %op1, i1 false)
   store <64 x i32> %res, <64 x i32>* %a
@@ -1276,30 +1506,33 @@ define void @abs_v64i32(<64 x i32>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @abs_v1i64(<1 x i64> %op1) #0 {
+define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v1i64:
-; CHECK: abs d0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs d0, d0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @abs_v2i64(<2 x i64> %op1) #0 {
+define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v2i64:
-; CHECK: abs v0.2d, v0.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    abs v0.2d, v0.2d
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   ret <2 x i64> %res
 }
 
-define void @abs_v4i64(<4 x i64>* %a) #0 {
+define void @abs_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, <4 x i64>* %a
@@ -1307,38 +1540,53 @@ define void @abs_v4i64(<4 x i64>* %a) #0 {
 }
 
 define void @abs_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: abs_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: abs_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    abs z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    abs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: abs_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    abs z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
 
-define void @abs_v16i64(<16 x i64>* %a) #0 {
+define void @abs_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: abs_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %op1, i1 false)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @abs_v32i64(<32 x i64>* %a) #0 {
+define void @abs_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: abs_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK: abs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    abs z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %op1, i1 false)
   store <32 x i64> %res, <32 x i64>* %a

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
index 9cecfbe40b74..29b9392c77d0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
@@ -1,58 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
 ;
 ; ICMP EQ
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v8i8:
-; CHECK: cmeq v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %cmp = icmp eq <8 x i8> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i8>
   ret <8 x i8> %sext
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v16i8:
-; CHECK: cmeq v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %cmp = icmp eq <16 x i8> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i8>
   ret <16 x i8> %sext
 }
 
-define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %cmp = icmp eq <32 x i8> %op1, %op2
@@ -62,29 +50,31 @@ define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].b, [[PG]]/z, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].b, [[PG]]/z, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].b, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].b, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1b { [[SEXT_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[SEXT_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z2.b
+; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z1.b, z3.b
+; VBITS_GE_256-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.b, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; VBITS_GE_512-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %cmp = icmp eq <64 x i8> %op1, %op2
@@ -93,15 +83,16 @@ define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: icmp_eq_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %cmp = icmp eq <128 x i8> %op1, %op2
@@ -110,15 +101,16 @@ define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: icmp_eq_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %cmp = icmp eq <256 x i8> %op1, %op2
@@ -128,34 +120,37 @@ define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v4i16:
-; CHECK: cmeq v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %cmp = icmp eq <4 x i16> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v8i16:
-; CHECK: cmeq v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %cmp = icmp eq <8 x i16> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
 }
 
-define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %cmp = icmp eq <16 x i16> %op1, %op2
@@ -165,29 +160,31 @@ define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].h, [[PG]]/z, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].h, [[PG]]/z, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].h, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].h, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1h { [[SEXT_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[SEXT_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; VBITS_GE_512-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %cmp = icmp eq <32 x i16> %op1, %op2
@@ -196,15 +193,16 @@ define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: icmp_eq_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %cmp = icmp eq <64 x i16> %op1, %op2
@@ -213,15 +211,16 @@ define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: icmp_eq_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %cmp = icmp eq <128 x i16> %op1, %op2
@@ -231,34 +230,37 @@ define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v2i32:
-; CHECK: cmeq v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %cmp = icmp eq <2 x i32> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v4i32:
-; CHECK: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %cmp = icmp eq <4 x i32> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
 }
 
-define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %cmp = icmp eq <8 x i32> %op1, %op2
@@ -268,29 +270,31 @@ define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].s, [[PG]]/z, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].s, [[PG]]/z, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].s, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].s, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1w { [[SEXT_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[SEXT_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; VBITS_GE_512-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %cmp = icmp eq <16 x i32> %op1, %op2
@@ -299,15 +303,16 @@ define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: icmp_eq_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %cmp = icmp eq <32 x i32> %op1, %op2
@@ -316,15 +321,16 @@ define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: icmp_eq_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %cmp = icmp eq <64 x i32> %op1, %op2
@@ -334,34 +340,37 @@ define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v1i64:
-; CHECK: cmeq d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq d0, d0, d1
+; CHECK-NEXT:    ret
   %cmp = icmp eq <1 x i64> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v2i64:
-; CHECK: cmeq v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %cmp = icmp eq <2 x i64> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
 }
 
-define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_eq_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %cmp = icmp eq <4 x i64> %op1, %op2
@@ -371,29 +380,31 @@ define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: icmp_eq_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].d, [[PG]]/z, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].d, [[PG]]/z, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].d, [[CMP_LO]]/z, #-1
-; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].d, [[CMP_HI]]/z, #-1
-; VBITS_EQ_256-DAG: st1d { [[SEXT_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[SEXT_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: icmp_eq_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: icmp_eq_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; VBITS_GE_512-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %cmp = icmp eq <8 x i64> %op1, %op2
@@ -402,15 +413,16 @@ define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: icmp_eq_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %cmp = icmp eq <16 x i64> %op1, %op2
@@ -419,15 +431,16 @@ define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: icmp_eq_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %cmp = icmp eq <32 x i64> %op1, %op2
@@ -440,15 +453,16 @@ define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ; ICMP NE
 ;
 
-define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_ne_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpne [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
-; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %cmp = icmp ne <32 x i8> %op1, %op2
@@ -461,15 +475,16 @@ define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 ; ICMP SGE
 ;
 
-define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(4,0) #0 {
 ; CHECK-LABEL: icmp_sge_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %cmp = icmp sge <32 x i16> %op1, %op2
@@ -482,15 +497,16 @@ define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
 ; ICMP SGT
 ;
 
-define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_sgt_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
-; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %cmp = icmp sgt <16 x i16> %op1, %op2
@@ -503,15 +519,16 @@ define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 ; ICMP SLE
 ;
 
-define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) vscale_range(4,0) #0 {
 ; CHECK-LABEL: icmp_sle_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpge p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %cmp = icmp sle <16 x i32> %op1, %op2
@@ -524,15 +541,16 @@ define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
 ; ICMP SLT
 ;
 
-define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_slt_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
-; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpgt p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %cmp = icmp slt <8 x i32> %op1, %op2
@@ -545,15 +563,16 @@ define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 ; ICMP UGE
 ;
 
-define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) vscale_range(4,0) #0 {
 ; CHECK-LABEL: icmp_uge_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmphs p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %cmp = icmp uge <8 x i64> %op1, %op2
@@ -566,15 +585,16 @@ define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
 ; ICMP UGT
 ;
 
-define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: icmp_ugt_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmphi p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %cmp = icmp ugt <4 x i64> %op1, %op2
@@ -587,15 +607,16 @@ define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 ; ICMP ULE
 ;
 
-define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: icmp_ule_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d
-; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmphs p1.d, p0/z, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %cmp = icmp ule <16 x i64> %op1, %op2
@@ -608,15 +629,16 @@ define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
 ; ICMP ULT
 ;
 
-define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: icmp_ult_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d
-; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
-; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmphi p1.d, p0/z, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %cmp = icmp ult <32 x i64> %op1, %op2

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index 8b1bae5009a1..9c1e9577df16 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -1,19 +1,8 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048,VBITS_EQ_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,164 +13,164 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: sdiv_v8i8:
-; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG0]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s0, [[SCALAR0]]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v8i8:
-; VBITS_EQ_128:         sshll v1.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sshll v0.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT:    sunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT:    xtn v0.8b, v0.8h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: sdiv_v8i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
+; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
+; VBITS_GE_256-NEXT:    fmov s0, w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
+; VBITS_GE_256-NEXT:    mov v0.b[1], w9
+; VBITS_GE_256-NEXT:    mov v0.b[2], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
+; VBITS_GE_256-NEXT:    mov v0.b[3], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
+; VBITS_GE_256-NEXT:    mov v0.b[4], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
+; VBITS_GE_256-NEXT:    mov v0.b[5], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
+; VBITS_GE_256-NEXT:    mov v0.b[6], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
+; VBITS_GE_256-NEXT:    mov v0.b[7], w8
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    umov w8, v1.h[0]
+; VBITS_GE_512-NEXT:    umov w9, v1.h[1]
+; VBITS_GE_512-NEXT:    fmov s0, w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[2]
+; VBITS_GE_512-NEXT:    mov v0.b[1], w9
+; VBITS_GE_512-NEXT:    mov v0.b[2], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[3]
+; VBITS_GE_512-NEXT:    mov v0.b[3], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[4]
+; VBITS_GE_512-NEXT:    mov v0.b[4], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[5]
+; VBITS_GE_512-NEXT:    mov v0.b[5], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[6]
+; VBITS_GE_512-NEXT:    mov v0.b[6], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[7]
+; VBITS_GE_512-NEXT:    mov v0.b[7], w8
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: sdiv_v16i8:
-
-; HALF VECTOR:
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v16i8:
-; VBITS_EQ_128:         sunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT:    sunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sunpklo z1.h, z1.b
-; VBITS_EQ_128-NEXT:    sunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT:    sunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    sunpklo z0.h, z0.b
-; VBITS_EQ_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z0.h
-; VBITS_EQ_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
-; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    uzp1 z1.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z3.h
-; VBITS_EQ_128-NEXT:    uzp1 z0.b, z0.b, z1.b
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: sdiv_v16i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    sunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT:    sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_128-NEXT:    sunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    sunpkhi z3.s, z1.h
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 z1.h, z2.h, z4.h
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z3.h
+; VBITS_GE_128-NEXT:    uzp1 z0.b, z0.b, z1.b
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v16i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sdiv_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = sdiv <32 x i8> %op1, %op2
@@ -189,66 +178,22 @@ define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES2]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = sdiv <64 x i8> %op1, %op2
@@ -256,52 +201,25 @@ define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpkhi z2.s, z1.h
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = sdiv <128 x i8> %op1, %op2
@@ -309,35 +227,34 @@ define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v256i8:
-
-; FULL VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpkhi z2.h, z1.b
+; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpkhi z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z5.s, z3.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    sunpkhi z5.s, z1.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = sdiv <256 x i8> %op1, %op2
@@ -348,84 +265,144 @@ define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: sdiv_v4i16:
-; CHECK: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v4i16:
-; VBITS_EQ_128:         sshll v1.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sshll v0.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    xtn v0.4h, v0.4s
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: sdiv_v4i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v4i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    sshll v0.4s, v0.4h, #0
+; VBITS_GE_256-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
+; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
+; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_256-NEXT:    mov v0.h[1], w8
+; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
+; VBITS_GE_256-NEXT:    mov v0.h[2], w9
+; VBITS_GE_256-NEXT:    mov v0.h[3], w8
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v4i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    sshll v0.4s, v0.4h, #0
+; VBITS_GE_512-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
+; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
+; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_512-NEXT:    mov v0.h[1], w8
+; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
+; VBITS_GE_512-NEXT:    mov v0.h[2], w9
+; VBITS_GE_512-NEXT:    mov v0.h[3], w8
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT:    ret
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: sdiv_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v8i16:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: sdiv_v8i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
 define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: sdiv_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; VBITS_GE_128-LABEL: sdiv_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sunpkhi z6.s, z0.h
+; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    ldp q3, q2, [x0]
+; VBITS_GE_128-NEXT:    sunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    sdiv z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z3.s
+; VBITS_GE_128-NEXT:    sdivr z1.s, p0/m, z1.s, z2.s
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z5.h
+; VBITS_GE_128-NEXT:    uzp1 z1.h, z1.h, z4.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = sdiv <16 x i16> %op1, %op2
@@ -433,34 +410,19 @@ define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sdiv_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = sdiv <32 x i16> %op1, %op2
@@ -468,34 +430,19 @@ define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v64i16:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = sdiv <64 x i16> %op1, %op2
@@ -503,21 +450,22 @@ define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v128i16:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sunpkhi z2.s, z1.h
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = sdiv <128 x i16> %op1, %op2
@@ -526,45 +474,42 @@ define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
-define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v2i32:
-; VBITS_EQ_128:         ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
-define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v4i32:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = sdiv <8 x i32> %op1, %op2
@@ -573,13 +518,45 @@ define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: sdiv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: sdiv_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z4.s
+; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z5.s
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z2
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z6.s
+; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z4.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = sdiv <16 x i32> %op1, %op2
@@ -587,14 +564,15 @@ define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sdiv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = sdiv <32 x i32> %op1, %op2
@@ -602,14 +580,15 @@ define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = sdiv <64 x i32> %op1, %op2
@@ -618,45 +597,42 @@ define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
-define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v1i64:
-; VBITS_EQ_128:         ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
-define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: sdiv_v2i64:
-; VBITS_EQ_128:         ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = sdiv <4 x i64> %op1, %op2
@@ -665,13 +641,45 @@ define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: sdiv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: sdiv_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT:    sdiv z0.d, p0/m, z0.d, z4.d
+; VBITS_GE_128-NEXT:    sdiv z1.d, p0/m, z1.d, z5.d
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z2
+; VBITS_GE_128-NEXT:    sdiv z0.d, p0/m, z0.d, z6.d
+; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    sdiv z1.d, p0/m, z1.d, z4.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sdiv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sdiv z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    sdiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = sdiv <8 x i64> %op1, %op2
@@ -679,14 +687,15 @@ define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sdiv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = sdiv <16 x i64> %op1, %op2
@@ -694,14 +703,15 @@ define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = sdiv <32 x i64> %op1, %op2
@@ -716,164 +726,164 @@ define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ; Vector vXi8 udiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: udiv_v8i8:
-; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG0]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s0, [[SCALAR0]]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v8i8:
-; VBITS_EQ_128:         ushll v1.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    ushll v0.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT:    xtn v0.8b, v0.8h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: udiv_v8i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
+; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
+; VBITS_GE_256-NEXT:    fmov s0, w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
+; VBITS_GE_256-NEXT:    mov v0.b[1], w9
+; VBITS_GE_256-NEXT:    mov v0.b[2], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
+; VBITS_GE_256-NEXT:    mov v0.b[3], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
+; VBITS_GE_256-NEXT:    mov v0.b[4], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
+; VBITS_GE_256-NEXT:    mov v0.b[5], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
+; VBITS_GE_256-NEXT:    mov v0.b[6], w8
+; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
+; VBITS_GE_256-NEXT:    mov v0.b[7], w8
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v8i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z1.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    umov w8, v1.h[0]
+; VBITS_GE_512-NEXT:    umov w9, v1.h[1]
+; VBITS_GE_512-NEXT:    fmov s0, w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[2]
+; VBITS_GE_512-NEXT:    mov v0.b[1], w9
+; VBITS_GE_512-NEXT:    mov v0.b[2], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[3]
+; VBITS_GE_512-NEXT:    mov v0.b[3], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[4]
+; VBITS_GE_512-NEXT:    mov v0.b[4], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[5]
+; VBITS_GE_512-NEXT:    mov v0.b[5], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[6]
+; VBITS_GE_512-NEXT:    mov v0.b[6], w8
+; VBITS_GE_512-NEXT:    umov w8, v1.h[7]
+; VBITS_GE_512-NEXT:    mov v0.b[7], w8
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: udiv_v16i8:
-
-; HALF VECTOR:
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
-; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v16i8:
-; VBITS_EQ_128:         uunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT:    uunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    uunpklo z1.h, z1.b
-; VBITS_EQ_128-NEXT:    uunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT:    uunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    uunpklo z0.h, z0.b
-; VBITS_EQ_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z0.h
-; VBITS_EQ_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
-; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    uzp1 z1.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z3.h
-; VBITS_EQ_128-NEXT:    uzp1 z0.b, z0.b, z1.b
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: udiv_v16i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    uunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT:    uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_128-NEXT:    uunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    uunpkhi z3.s, z1.h
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 z1.h, z2.h, z4.h
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z3.h
+; VBITS_GE_128-NEXT:    uzp1 z0.b, z0.b, z1.b
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v16i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v16i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: udiv_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = udiv <32 x i8> %op1, %op2
@@ -881,66 +891,22 @@ define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES2]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = udiv <64 x i8> %op1, %op2
@@ -948,52 +914,25 @@ define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP2:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpkhi z2.s, z1.h
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = udiv <128 x i8> %op1, %op2
@@ -1001,33 +940,34 @@ define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v256i8:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udiv [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: st1b { [[UZP3:z[0-9]+]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpkhi z2.h, z1.b
+; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpkhi z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    uunpkhi z5.s, z1.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = udiv <256 x i8> %op1, %op2
@@ -1038,84 +978,144 @@ define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: udiv_v4i16:
-; CHECK: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v4i16:
-; VBITS_EQ_128:         ushll v1.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    ushll v0.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    xtn v0.4h, v0.4s
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: udiv_v4i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v4i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    ushll v0.4s, v0.4h, #0
+; VBITS_GE_256-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
+; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
+; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_256-NEXT:    mov v0.h[1], w8
+; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
+; VBITS_GE_256-NEXT:    mov v0.h[2], w9
+; VBITS_GE_256-NEXT:    mov v0.h[3], w8
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v4i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    ushll v0.4s, v0.4h, #0
+; VBITS_GE_512-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
+; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
+; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
+; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_512-NEXT:    mov v0.h[1], w8
+; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
+; VBITS_GE_512-NEXT:    mov v0.h[2], w9
+; VBITS_GE_512-NEXT:    mov v0.h[3], w8
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; VBITS_GE_512-NEXT:    ret
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: udiv_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v8i16:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: udiv_v8i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
 define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: udiv_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; VBITS_GE_128-LABEL: udiv_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    uunpkhi z6.s, z0.h
+; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    ldp q3, q2, [x0]
+; VBITS_GE_128-NEXT:    uunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    udiv z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z3.s
+; VBITS_GE_128-NEXT:    udivr z1.s, p0/m, z1.s, z2.s
+; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z5.h
+; VBITS_GE_128-NEXT:    uzp1 z1.h, z1.h, z4.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = udiv <16 x i16> %op1, %op2
@@ -1123,34 +1123,19 @@ define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: udiv_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = udiv <32 x i16> %op1, %op2
@@ -1158,34 +1143,19 @@ define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v64i16:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = udiv <64 x i16> %op1, %op2
@@ -1193,21 +1163,22 @@ define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v128i16:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uunpkhi z2.s, z1.h
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = udiv <128 x i16> %op1, %op2
@@ -1216,46 +1187,42 @@ define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Vector v2i32 udiv are not legal for NEON so use SVE when available.
-define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v2i32:
-; VBITS_EQ_128:         ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Vector v4i32 udiv are not legal for NEON so use SVE when available.
-define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v4i32:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: udiv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = udiv <8 x i32> %op1, %op2
@@ -1264,13 +1231,45 @@ define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: udiv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: udiv_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z4.s
+; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z5.s
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z2
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z6.s
+; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z4.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = udiv <16 x i32> %op1, %op2
@@ -1278,14 +1277,15 @@ define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: udiv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = udiv <32 x i32> %op1, %op2
@@ -1293,14 +1293,15 @@ define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = udiv <64 x i32> %op1, %op2
@@ -1309,45 +1310,42 @@ define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
-define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v1i64:
-; VBITS_EQ_128:         ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
-define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: udiv_v2i64:
-; VBITS_EQ_128:         ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: udiv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = udiv <4 x i64> %op1, %op2
@@ -1356,13 +1354,45 @@ define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: udiv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: udiv_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT:    udiv z0.d, p0/m, z0.d, z4.d
+; VBITS_GE_128-NEXT:    udiv z1.d, p0/m, z1.d, z5.d
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z2
+; VBITS_GE_128-NEXT:    udiv z0.d, p0/m, z0.d, z6.d
+; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    udiv z1.d, p0/m, z1.d, z4.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: udiv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    udiv z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    udiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: udiv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = udiv <8 x i64> %op1, %op2
@@ -1370,14 +1400,15 @@ define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: udiv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = udiv <16 x i64> %op1, %op2
@@ -1385,14 +1416,15 @@ define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: udiv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = udiv <32 x i64> %op1, %op2
@@ -1402,14 +1434,15 @@ define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 
 ; This used to crash because isUnaryPredicate and BuildUDIV don't know how
 ; a SPLAT_VECTOR of fixed vector type should be handled.
-define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #1 {
+define void @udiv_constantsplat_v8i32(<8 x i32>* %a) vscale_range(2,0) #1 {
 ; CHECK-LABEL: udiv_constantsplat_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: mov [[OP2:z[0-9]+]].s, #95
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov z1.s, #95 // =0x5f
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
   store <8 x i32> %res, <8 x i32>* %a

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
index bfe4b47242c3..b9f5c12c331a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
@@ -1,25 +1,10 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
 ;
 ; sext i1 -> i32
 ;
@@ -27,15 +12,17 @@ target triple = "aarch64-unknown-linux-gnu"
 ; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
 ; type's element type is not byte based and thus cannot be lowered directly to
 ; an SVE instruction.
-define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 {
+define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v8i1_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; CHECK-NEXT: lsl [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
-; CHECK-NEXT: asr [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
-; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, #31
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, #31
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <8 x i1> %a to <8 x i32>
   store <8 x i32> %b, <8 x i32>* %out
   ret void
@@ -48,15 +35,17 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 {
 ; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
 ; type's element type is not power-of-2 based and thus cannot be lowered
 ; directly to an SVE instruction.
-define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v4i3_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: lsl [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
-; CHECK-NEXT: asr [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
-; CHECK-NEXT: st1d { [[A_WORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, #61
+; CHECK-NEXT:    asr z0.d, p0/m, z0.d, #61
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <4 x i3> %a to <4 x i64>
   store <4 x i64> %b, <4 x i64>* %out
   ret void
@@ -66,12 +55,14 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 {
 ; sext i8 -> i16
 ;
 
-define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
+define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v16i8_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, <16 x i16>* %out
   ret void
@@ -79,12 +70,29 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
 
 ; NOTE: Extra 'add' is to prevent the extend being combined with the load.
 define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
-; CHECK-LABEL: sext_v32i8_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v32i8_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sext_v32i8_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i16>
@@ -92,13 +100,16 @@ define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
   ret void
 }
 
-define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
+define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sext_v64i8_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i8>, <64 x i8>* %in
   %b = add <64 x i8> %a, %a
   %c = sext <64 x i8> %b to <64 x i16>
@@ -106,13 +117,16 @@ define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
   ret void
 }
 
-define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
+define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sext_v128i8_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <128 x i8>, <128 x i8>* %in
   %b = add <128 x i8> %a, %a
   %c = sext <128 x i8> %b to <128 x i16>
@@ -124,50 +138,59 @@ define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
 ; sext i8 -> i32
 ;
 
-define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
+define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v8i8_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; CHECK-NEXT: st1w { [[A_HALFS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, <8 x i32>* %out
   ret void
 }
 
 define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
-; CHECK-LABEL: sext_v16i8_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
-; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
-; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v16i8_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sext_v16i8_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, <16 x i32>* %out
   ret void
 }
 
-define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
+define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sext_v32i8_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i32>
@@ -175,14 +198,17 @@ define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
   ret void
 }
 
-define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
+define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sext_v64i8_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i8>, <64 x i8>* %in
   %b = add <64 x i8> %a, %a
   %c = sext <64 x i8> %b to <64 x i32>
@@ -197,54 +223,77 @@ define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
 ; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The sign
 ; extend is a two step process where the container is any_extend'd with the
 ; result feeding an inreg sign extend.
-define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v4i8_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[ANYEXT_W:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[ANYEXT_D:z[0-9]+]].d, [[ANYEXT_W]].s
-; CHECK-NEXT: sxtb [[A_DWORDS:z[0-9]+]].d, [[PG]]/m, [[ANYEXT_D]].d
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, <4 x i64>* %out
   ret void
 }
 
 define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: sext_v8i8_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v8i8_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sext_v8i8_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, <8 x i64>* %out
   ret void
 }
 
-define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
+define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sext_v16i8_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, <16 x i64>* %out
   ret void
 }
 
-define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
+define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sext_v32i8_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i64>
@@ -256,24 +305,43 @@ define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
 ; sext i16 -> i32
 ;
 
-define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
+define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v8i16_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, <8 x i32>* %out
   ret void
 }
 
 define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
-; CHECK-LABEL: sext_v16i16_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v16i16_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sext_v16i16_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i32>
@@ -281,13 +349,16 @@ define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
   ret void
 }
 
-define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
+define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sext_v32i16_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %in
   %b = add <32 x i16> %a, %a
   %c = sext <32 x i16> %b to <32 x i32>
@@ -295,13 +366,16 @@ define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
   ret void
 }
 
-define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
+define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sext_v64i16_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i16>, <64 x i16>* %in
   %b = add <64 x i16> %a, %a
   %c = sext <64 x i16> %b to <64 x i32>
@@ -313,38 +387,59 @@ define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
 ; sext i16 -> i64
 ;
 
-define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v4i16_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, <4 x i64>* %out
   ret void
 }
 
 define void @sext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: sext_v8i16_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v8i16_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sext_v8i16_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, <8 x i64>* %out
   ret void
 }
 
-define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
+define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sext_v16i16_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i64>
@@ -352,14 +447,17 @@ define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
   ret void
 }
 
-define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
+define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sext_v32i16_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %in
   %b = add <32 x i16> %a, %a
   %c = sext <32 x i16> %b to <32 x i64>
@@ -371,24 +469,43 @@ define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
 ; sext i32 -> i64
 ;
 
-define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
+define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v4i32_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = sext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, <4 x i64>* %out
   ret void
 }
 
 define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
-; CHECK-LABEL: sext_v8i32_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: sext_v8i32_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sext_v8i32_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %in
   %b = add <8 x i32> %a, %a
   %c = sext <8 x i32> %b to <8 x i64>
@@ -396,13 +513,16 @@ define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
   ret void
 }
 
-define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
+define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sext_v16i32_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i32>, <16 x i32>* %in
   %b = add <16 x i32> %a, %a
   %c = sext <16 x i32> %b to <16 x i64>
@@ -410,13 +530,16 @@ define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
   ret void
 }
 
-define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
+define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sext_v32i32_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i32>, <32 x i32>* %in
   %b = add <32 x i32> %a, %a
   %c = sext <32 x i32> %b to <32 x i64>
@@ -428,12 +551,14 @@ define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
 ; zext i8 -> i16
 ;
 
-define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
+define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v16i8_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, <16 x i16>* %out
   ret void
@@ -441,12 +566,29 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
 
 ; NOTE: Extra 'add' is to prevent the extend being combined with the load.
 define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
-; CHECK-LABEL: zext_v32i8_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v32i8_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: zext_v32i8_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i16>
@@ -454,13 +596,16 @@ define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
   ret void
 }
 
-define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
+define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: zext_v64i8_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i8>, <64 x i8>* %in
   %b = add <64 x i8> %a, %a
   %c = zext <64 x i8> %b to <64 x i16>
@@ -468,13 +613,16 @@ define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
   ret void
 }
 
-define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
+define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: zext_v128i8_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <128 x i8>, <128 x i8>* %in
   %b = add <128 x i8> %a, %a
   %c = zext <128 x i8> %b to <128 x i16>
@@ -486,50 +634,59 @@ define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
 ; zext i8 -> i32
 ;
 
-define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
+define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v8i8_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; CHECK-NEXT: st1w { [[A_HALFS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, <8 x i32>* %out
   ret void
 }
 
 define void @zext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
-; CHECK-LABEL: zext_v16i8_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
-; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
-; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
-; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[OUT_HI:[0-9]+]], #8
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x0, x[[OUT_HI]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v16i8_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: zext_v16i8_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, <16 x i32>* %out
   ret void
 }
 
-define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
+define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: zext_v32i8_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i32>
@@ -537,14 +694,17 @@ define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
   ret void
 }
 
-define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
+define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: zext_v64i8_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i8>, <64 x i8>* %in
   %b = add <64 x i8> %a, %a
   %c = zext <64 x i8> %b to <64 x i32>
@@ -559,54 +719,77 @@ define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
 ; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The zero
 ; extend is a two step process where the container is zero_extend_inreg'd with
 ; the result feeding a normal zero extend from halfwords to doublewords.
-define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
+define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v4i8_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, <4 x i64>* %out
   ret void
 }
 
 define void @zext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: zext_v8i8_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v8i8_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: zext_v8i8_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, <8 x i64>* %out
   ret void
 }
 
-define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
+define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: zext_v16i8_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, <16 x i64>* %out
   ret void
 }
 
-define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
+define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: zext_v32i8_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
-; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i64>
@@ -618,24 +801,43 @@ define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
 ; zext i16 -> i32
 ;
 
-define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
+define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v8i16_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, <8 x i32>* %out
   ret void
 }
 
 define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
-; CHECK-LABEL: zext_v16i16_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v16i16_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: zext_v16i16_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i32>
@@ -643,13 +845,16 @@ define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
   ret void
 }
 
-define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
+define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: zext_v32i16_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %in
   %b = add <32 x i16> %a, %a
   %c = zext <32 x i16> %b to <32 x i32>
@@ -657,13 +862,16 @@ define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
   ret void
 }
 
-define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
+define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: zext_v64i16_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i16>, <64 x i16>* %in
   %b = add <64 x i16> %a, %a
   %c = zext <64 x i16> %b to <64 x i32>
@@ -675,38 +883,59 @@ define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
 ; zext i16 -> i64
 ;
 
-define void @zext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
+define void @zext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v4i16_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, <4 x i64>* %out
   ret void
 }
 
 define void @zext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
-; CHECK-LABEL: zext_v8i16_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
-; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v8i16_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: zext_v8i16_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, <8 x i64>* %out
   ret void
 }
 
-define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
+define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: zext_v16i16_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i64>
@@ -714,14 +943,17 @@ define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
   ret void
 }
 
-define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
+define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: zext_v32i16_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
-; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
-; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %in
   %b = add <32 x i16> %a, %a
   %c = zext <32 x i16> %b to <32 x i64>
@@ -733,24 +965,43 @@ define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
 ; zext i32 -> i64
 ;
 
-define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
+define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v4i32_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
-; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %b = zext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, <4 x i64>* %out
   ret void
 }
 
 define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
-; CHECK-LABEL: zext_v8i32_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: zext_v8i32_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: zext_v8i32_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %in
   %b = add <8 x i32> %a, %a
   %c = zext <8 x i32> %b to <8 x i64>
@@ -758,13 +1009,16 @@ define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
   ret void
 }
 
-define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
+define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: zext_v16i32_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i32>, <16 x i32>* %in
   %b = add <16 x i32> %a, %a
   %c = zext <16 x i32> %b to <16 x i64>
@@ -772,13 +1026,16 @@ define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
   ret void
 }
 
-define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
+define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: zext_v32i32_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
-; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
-; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i32>, <32 x i32>* %in
   %b = add <32 x i32> %a, %a
   %c = zext <32 x i32> %b to <32 x i64>

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
index 0833ed53a932..9e9ce74f7b1b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
@@ -1,60 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; AND
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v8i8:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = and <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v16i8:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = and <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = and <32 x i8> %op1, %op2
@@ -63,18 +46,28 @@ define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @and_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: and_v64i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_256-DAG: and [[RES_1:z[0-9]+]].d, [[OP1_1]].d, [[OP2_1]].d
-; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: and_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = and <64 x i8> %op1, %op2
@@ -82,29 +75,15 @@ define void @and_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: and_v128i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_512-DAG: and [[RES_1:z[0-9]+]].d, [[OP1_1]].d, [[OP2_1]].d
-; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_256-DAG: and [[RES_2:z[0-9]+]].d, [[OP1_2]].d, [[OP2_2]].d
-; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_256-DAG: and [[RES_3:z[0-9]+]].d, [[OP1_3]].d, [[OP2_3]].d
-; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = and <128 x i8> %op1, %op2
@@ -112,49 +91,15 @@ define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: and_v256i8:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-DAG: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
-; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
-; VBITS_LE_1024-DAG: and [[RES_1:z[0-9]+]].d, [[OP1_1]].d, [[OP2_1]].d
-; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
-; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
-; VBITS_LE_512-DAG: and [[RES_2:z[0-9]+]].d, [[OP1_2]].d, [[OP2_2]].d
-; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
-; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
-; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
-; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
-; VBITS_LE_512-DAG: and [[RES_3:z[0-9]+]].d, [[OP1_3]].d, [[OP2_3]].d
-; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
-; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
-; VBITS_LE_256-DAG: and [[RES_4:z[0-9]+]].d, [[OP1_4]].d, [[OP2_4]].d
-; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
-; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
-; VBITS_LE_256-DAG: and [[RES_5:z[0-9]+]].d, [[OP1_5]].d, [[OP2_5]].d
-; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
-; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
-; VBITS_LE_256-DAG: and [[RES_6:z[0-9]+]].d, [[OP1_6]].d, [[OP2_6]].d
-; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
-; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
-; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
-; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
-; VBITS_LE_256-DAG: and [[RES_7:z[0-9]+]].d, [[OP1_7]].d, [[OP2_7]].d
-; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = and <256 x i8> %op1, %op2
@@ -163,31 +108,34 @@ define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v4i16:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = and <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v8i16:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = and <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = and <16 x i16> %op1, %op2
@@ -195,16 +143,29 @@ define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
 define void @and_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: and_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: and_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = and <32 x i16> %op1, %op2
@@ -212,16 +173,15 @@ define void @and_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: and_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = and <64 x i16> %op1, %op2
@@ -229,16 +189,15 @@ define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: and_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = and <128 x i16> %op1, %op2
@@ -247,31 +206,34 @@ define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v2i32:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = and <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v4i32:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = and <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = and <8 x i32> %op1, %op2
@@ -279,16 +241,29 @@ define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
 define void @and_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: and_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: and_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = and <16 x i32> %op1, %op2
@@ -296,16 +271,15 @@ define void @and_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: and_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = and <32 x i32> %op1, %op2
@@ -313,16 +287,15 @@ define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: and_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = and <64 x i32> %op1, %op2
@@ -331,31 +304,34 @@ define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v1i64:
-; CHECK: and v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = and <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v2i64:
-; CHECK: and v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = and <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: and_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = and <4 x i64> %op1, %op2
@@ -363,16 +339,29 @@ define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
 define void @and_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: and_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: and_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    and z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: and_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = and <8 x i64> %op1, %op2
@@ -380,16 +369,15 @@ define void @and_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: and_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = and <16 x i64> %op1, %op2
@@ -397,16 +385,15 @@ define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-; NOTE: Check lines only cover the first VBYTES because the and_v#i8 tests
-; already cover the general legalisation cases.
-define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: and_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: and [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = and <32 x i64> %op1, %op2
@@ -414,41 +401,39 @@ define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
   ret void
 }
 
-;
-; NOTE: Tests beyond this point only have CHECK lines to validate the first
-; VBYTES because the and tests already validate the legalisation code paths.
-;
-
 ;
 ; OR
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v8i8:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = or <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v16i8:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = or <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = or <32 x i8> %op1, %op2
@@ -457,13 +442,28 @@ define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @or_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: or_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: or_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = or <64 x i8> %op1, %op2
@@ -471,14 +471,15 @@ define void @or_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: or_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = or <128 x i8> %op1, %op2
@@ -486,14 +487,15 @@ define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: or_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = or <256 x i8> %op1, %op2
@@ -502,31 +504,34 @@ define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v4i16:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = or <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v8i16:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = or <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = or <16 x i16> %op1, %op2
@@ -535,13 +540,28 @@ define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @or_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: or_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: or_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = or <32 x i16> %op1, %op2
@@ -549,14 +569,15 @@ define void @or_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: or_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = or <64 x i16> %op1, %op2
@@ -564,14 +585,15 @@ define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: or_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = or <128 x i16> %op1, %op2
@@ -580,31 +602,34 @@ define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v2i32:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = or <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v4i32:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = or <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = or <8 x i32> %op1, %op2
@@ -613,13 +638,28 @@ define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @or_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: or_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: or_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = or <16 x i32> %op1, %op2
@@ -627,14 +667,15 @@ define void @or_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: or_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = or <32 x i32> %op1, %op2
@@ -642,14 +683,15 @@ define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: or_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = or <64 x i32> %op1, %op2
@@ -658,31 +700,34 @@ define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v1i64:
-; CHECK: orr v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = or <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v2i64:
-; CHECK: orr v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = or <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: or_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = or <4 x i64> %op1, %op2
@@ -691,13 +736,28 @@ define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @or_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: or_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: or_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: or_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = or <8 x i64> %op1, %op2
@@ -705,14 +765,15 @@ define void @or_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: or_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = or <16 x i64> %op1, %op2
@@ -720,14 +781,15 @@ define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @or_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @or_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: or_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: orr [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = or <32 x i64> %op1, %op2
@@ -740,31 +802,34 @@ define void @or_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v8i8:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = xor <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v16i8:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = xor <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = xor <32 x i8> %op1, %op2
@@ -773,13 +838,28 @@ define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @xor_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: xor_v64i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: xor_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = xor <64 x i8> %op1, %op2
@@ -787,14 +867,15 @@ define void @xor_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @xor_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @xor_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: xor_v128i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = xor <128 x i8> %op1, %op2
@@ -802,14 +883,15 @@ define void @xor_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @xor_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @xor_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: xor_v256i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = xor <256 x i8> %op1, %op2
@@ -818,31 +900,34 @@ define void @xor_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v4i16:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = xor <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v8i16:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = xor <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = xor <16 x i16> %op1, %op2
@@ -851,13 +936,28 @@ define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @xor_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: xor_v32i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: xor_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = xor <32 x i16> %op1, %op2
@@ -865,14 +965,15 @@ define void @xor_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @xor_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @xor_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: xor_v64i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = xor <64 x i16> %op1, %op2
@@ -880,14 +981,15 @@ define void @xor_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @xor_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @xor_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: xor_v128i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = xor <128 x i16> %op1, %op2
@@ -896,31 +998,34 @@ define void @xor_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v2i32:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = xor <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v4i32:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = xor <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = xor <8 x i32> %op1, %op2
@@ -929,13 +1034,28 @@ define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @xor_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: xor_v16i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: xor_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = xor <16 x i32> %op1, %op2
@@ -943,14 +1063,15 @@ define void @xor_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @xor_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @xor_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: xor_v32i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = xor <32 x i32> %op1, %op2
@@ -958,14 +1079,15 @@ define void @xor_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @xor_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @xor_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: xor_v64i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = xor <64 x i32> %op1, %op2
@@ -974,31 +1096,34 @@ define void @xor_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v1i64:
-; CHECK: eor v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = xor <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v2i64:
-; CHECK: eor v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = xor <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: xor_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = xor <4 x i64> %op1, %op2
@@ -1007,13 +1132,28 @@ define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @xor_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: xor_v8i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; VBITS_GE_256-LABEL: xor_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    eor z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    eor z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: xor_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    eor z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = xor <8 x i64> %op1, %op2
@@ -1021,14 +1161,15 @@ define void @xor_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @xor_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @xor_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: xor_v16i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = xor <16 x i64> %op1, %op2
@@ -1036,14 +1177,15 @@ define void @xor_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @xor_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @xor_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: xor_v32i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK: eor [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = xor <32 x i64> %op1, %op2

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
index 065c278299b7..ca8bf9438200 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
@@ -1,55 +1,43 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SMAX
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v8i8:
-; CHECK: smax v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smax v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v16i8:
-; CHECK: smax v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
 
-define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -58,26 +46,28 @@ define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @smax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: smax_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    smax z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: smax_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -85,14 +75,15 @@ define void @smax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smax_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = call <128 x i8> @llvm.smax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -100,14 +91,15 @@ define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smax_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = call <256 x i8> @llvm.smax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -116,31 +108,34 @@ define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v4i16:
-; CHECK: smax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v8i16:
-; CHECK: smax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
 
-define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -149,26 +144,28 @@ define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @smax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: smax_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    smax z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smax_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -176,14 +173,15 @@ define void @smax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smax_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = call <64 x i16> @llvm.smax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -191,14 +189,15 @@ define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smax_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = call <128 x i16> @llvm.smax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -207,31 +206,34 @@ define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v2i32:
-; CHECK: smax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v4i32:
-; CHECK: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
 
-define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -240,26 +242,28 @@ define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @smax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: smax_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    smax z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smax_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -267,14 +271,15 @@ define void @smax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smax_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = call <32 x i32> @llvm.smax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -282,14 +287,15 @@ define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smax_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = call <64 x i32> @llvm.smax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -298,33 +304,42 @@ define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 max are not legal for NEON so use SVE when available.
-define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: smax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
 
 ; Vector i64 max are not legal for NEON so use SVE when available.
-define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: smax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
 
-define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -333,26 +348,28 @@ define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @smax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: smax_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smax_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    smax z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smax_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -360,14 +377,15 @@ define void @smax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smax_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = call <16 x i64> @llvm.smax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -375,14 +393,15 @@ define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smax_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = call <32 x i64> @llvm.smax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -395,31 +414,34 @@ define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v8i8:
-; CHECK: smin v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smin v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v16i8:
-; CHECK: smin v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smin v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
 
-define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -428,25 +450,28 @@ define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @smin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: smin_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    smin z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
+; VBITS_GE_512-LABEL: smin_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -454,14 +479,15 @@ define void @smin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smin_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = call <128 x i8> @llvm.smin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -469,14 +495,15 @@ define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smin_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = call <256 x i8> @llvm.smin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -485,31 +512,34 @@ define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v4i16:
-; CHECK: smin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v8i16:
-; CHECK: smin v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
 
-define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -518,26 +548,28 @@ define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @smin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: smin_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    smin z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smin_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -545,14 +577,15 @@ define void @smin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smin_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = call <64 x i16> @llvm.smin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -560,14 +593,15 @@ define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smin_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = call <128 x i16> @llvm.smin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -576,31 +610,34 @@ define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v2i32:
-; CHECK: smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v4i32:
-; CHECK: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
 
-define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -609,26 +646,28 @@ define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @smin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: smin_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    smin z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smin_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -636,14 +675,15 @@ define void @smin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smin_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -651,14 +691,15 @@ define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smin_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = call <64 x i32> @llvm.smin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -667,33 +708,42 @@ define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 min are not legal for NEON so use SVE when available.
-define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: smin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
 
 ; Vector i64 min are not legal for NEON so use SVE when available.
-define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: smin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
 
-define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -702,26 +752,28 @@ define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @smin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: smin_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smin_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    smin z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smin_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -729,14 +781,15 @@ define void @smin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smin_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -744,14 +797,15 @@ define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smin_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = call <32 x i64> @llvm.smin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -764,31 +818,34 @@ define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v8i8:
-; CHECK: umax v0.8b, v0.8b, v1.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umax v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v16i8:
-; CHECK: umax v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
 
-define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -797,26 +854,28 @@ define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @umax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: umax_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    umax z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: umax_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -824,14 +883,15 @@ define void @umax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umax_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = call <128 x i8> @llvm.umax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -839,14 +899,15 @@ define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umax_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = call <256 x i8> @llvm.umax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -855,31 +916,34 @@ define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v4i16:
-; CHECK: umax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v8i16:
-; CHECK: umax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
 
-define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -888,26 +952,28 @@ define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @umax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: umax_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    umax z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umax_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -915,14 +981,15 @@ define void @umax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umax_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = call <64 x i16> @llvm.umax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -930,14 +997,15 @@ define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umax_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = call <128 x i16> @llvm.umax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -946,31 +1014,34 @@ define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v2i32:
-; CHECK: umax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v4i32:
-; CHECK: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
 
-define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -979,26 +1050,28 @@ define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @umax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: umax_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    umax z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umax_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -1006,14 +1079,15 @@ define void @umax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umax_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = call <32 x i32> @llvm.umax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -1021,14 +1095,15 @@ define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umax_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = call <64 x i32> @llvm.umax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -1037,33 +1112,42 @@ define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 max are not legal for NEON so use SVE when available.
-define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
 
 ; Vector i64 max are not legal for NEON so use SVE when available.
-define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
 
-define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -1072,26 +1156,28 @@ define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @umax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: umax_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umax_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    umax z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umax_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -1099,14 +1185,15 @@ define void @umax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umax_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = call <16 x i64> @llvm.umax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -1114,14 +1201,15 @@ define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umax_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = call <32 x i64> @llvm.umax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -1134,31 +1222,34 @@ define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v8i8:
-; CHECK: umin v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umin v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v16i8:
-; CHECK: umin v0.16b, v0.16b, v1.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umin v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
 
-define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -1167,25 +1258,28 @@ define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @umin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: umin_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    umin z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
+; VBITS_GE_512-LABEL: umin_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
@@ -1193,14 +1287,15 @@ define void @umin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umin_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = call <128 x i8> @llvm.umin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
@@ -1208,14 +1303,15 @@ define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umin_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = call <256 x i8> @llvm.umin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
@@ -1224,31 +1320,34 @@ define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v4i16:
-; CHECK: umin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v8i16:
-; CHECK: umin v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
 
-define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -1257,26 +1356,28 @@ define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @umin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: umin_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    umin z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umin_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
@@ -1284,14 +1385,15 @@ define void @umin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umin_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = call <64 x i16> @llvm.umin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
@@ -1299,14 +1401,15 @@ define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umin_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = call <128 x i16> @llvm.umin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
@@ -1315,31 +1418,34 @@ define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v2i32:
-; CHECK: umin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v4i32:
-; CHECK: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
 
-define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -1348,26 +1454,28 @@ define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @umin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: umin_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    umin z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umin_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
@@ -1375,14 +1483,15 @@ define void @umin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umin_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = call <32 x i32> @llvm.umin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
@@ -1390,14 +1499,15 @@ define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umin_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = call <64 x i32> @llvm.umin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
@@ -1406,33 +1516,42 @@ define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 min are not legal for NEON so use SVE when available.
-define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
 
 ; Vector i64 min are not legal for NEON so use SVE when available.
-define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
 
-define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -1441,26 +1560,28 @@ define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @umin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: umin_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umin_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    umin z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umin_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
@@ -1468,14 +1589,15 @@ define void @umin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umin_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = call <16 x i64> @llvm.umin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
@@ -1483,14 +1605,15 @@ define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @umin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @umin_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umin_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = call <32 x i64> @llvm.umin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
@@ -1599,4 +1722,3 @@ declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
 declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)
 declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>)
 declare <32 x i64> @llvm.umax.v32i64(<32 x i64>, <32 x i64>)
-

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 32dc75e19d7f..b050a4dcfcdb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,25 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefixes=CHECK,VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 ; This test only tests the legal types for a given vector width, as mulh nodes
 ; do not get generated for non-legal types.
@@ -32,35 +14,29 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; Don't use SVE for 64-bit vectors.
 ; FIXME: The codegen for the >=256 bits case can be improved.
-define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v8i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    smull v0.8h, v0.8b, v1.8b
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v8i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    smull v0.8h, v0.8b, v1.8b
-; VBITS_GE_256-NEXT:    ushr v1.8h, v0.8h, #8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
-; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
-; VBITS_GE_256-NEXT:    fmov s0, w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
-; VBITS_GE_256-NEXT:    mov v0.b[1], w9
-; VBITS_GE_256-NEXT:    mov v0.b[2], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
-; VBITS_GE_256-NEXT:    mov v0.b[3], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
-; VBITS_GE_256-NEXT:    mov v0.b[4], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
-; VBITS_GE_256-NEXT:    mov v0.b[5], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
-; VBITS_GE_256-NEXT:    mov v0.b[6], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
-; VBITS_GE_256-NEXT:    mov v0.b[7], w8
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT:    ret
+define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ushr v1.8h, v0.8h, #8
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    mov v0.b[1], w9
+; CHECK-NEXT:    mov v0.b[2], w8
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.h[4]
+; CHECK-NEXT:    mov v0.b[4], w8
+; CHECK-NEXT:    umov w8, v1.h[5]
+; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.h[6]
+; CHECK-NEXT:    mov v0.b[6], w8
+; CHECK-NEXT:    umov w8, v1.h[7]
+; CHECK-NEXT:    mov v0.b[7], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i8> %op1 to <8 x i16>
@@ -72,7 +48,7 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    smull2 v2.8h, v0.16b, v1.16b
@@ -87,30 +63,15 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
   ret <16 x i8> %res
 }
 
-define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    smull v4.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT:    smull2 v0.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT:    smull v5.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT:    smull2 v1.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v32i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %1 = sext <32 x i8> %op1 to <32 x i16>
@@ -123,40 +84,56 @@ define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v64i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT:    smull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT:    smull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT:    smull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT:    smull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT:    smull2 v16.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT:    smull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    smull2 v3.8h, v4.16b, v5.16b
-; VBITS_EQ_128-NEXT:    smull v4.8h, v4.8b, v5.8b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v4.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: smulh_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sunpklo z4.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z5.h, z1.b
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z6.h, z2.b
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z7.h, z3.b
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z2.h, z2.b
+; VBITS_GE_256-NEXT:    sunpklo z3.h, z3.b
+; VBITS_GE_256-NEXT:    mul z0.h, p1/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    movprfx z2, z5
+; VBITS_GE_256-NEXT:    mul z2.h, p1/m, z2.h, z7.h
+; VBITS_GE_256-NEXT:    mul z1.h, p1/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    mul z4.h, p1/m, z4.h, z6.h
+; VBITS_GE_256-NEXT:    lsr z0.h, p1/m, z0.h, #8
+; VBITS_GE_256-NEXT:    movprfx z3, z4
+; VBITS_GE_256-NEXT:    lsr z3.h, p1/m, z3.h, #8
+; VBITS_GE_256-NEXT:    lsr z1.h, p1/m, z1.h, #8
+; VBITS_GE_256-NEXT:    lsr z2.h, p1/m, z2.h, #8
+; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    splice z3.b, p1, z3.b, z0.b
+; VBITS_GE_256-NEXT:    splice z2.b, p1, z2.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: smulh_v64i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: smulh_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %insert = insertelement <64 x i16> undef, i16 8, i64 0
@@ -170,64 +147,15 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v128i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT:    smull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT:    smull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    smull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT:    smull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT:    smull2 v17.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT:    smull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT:    ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    smull2 v19.8h, v4.16b, v16.16b
-; VBITS_EQ_128-NEXT:    smull v4.8h, v4.8b, v16.8b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT:    shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v4.16b, v19.8h, #8
-; VBITS_EQ_128-NEXT:    smull2 v21.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT:    smull v3.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT:    ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT:    smull2 v23.8h, v18.16b, v20.16b
-; VBITS_EQ_128-NEXT:    smull v18.8h, v18.8b, v20.8b
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v3.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT:    shrn v18.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn2 v18.16b, v23.8h, #8
-; VBITS_EQ_128-NEXT:    smull v20.8h, v16.8b, v5.8b
-; VBITS_EQ_128-NEXT:    smull2 v5.8h, v16.16b, v5.16b
-; VBITS_EQ_128-NEXT:    stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    smull v25.8h, v22.8b, v24.8b
-; VBITS_EQ_128-NEXT:    smull2 v16.8h, v22.16b, v24.16b
-; VBITS_EQ_128-NEXT:    shrn v20.8b, v20.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v22.8b, v25.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v20.16b, v5.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v22.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT:    stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %1 = sext <128 x i8> %op1 to <128 x i16>
@@ -239,130 +167,15 @@ define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v256i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    sub sp, sp, #96
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT:    ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT:    ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    smull2 v0.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT:    smull v4.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT:    smull2 v0.8h, v2.16b, v6.16b
-; VBITS_EQ_128-NEXT:    shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    smull v6.8h, v2.8b, v6.8b
-; VBITS_EQ_128-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    smull2 v2.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT:    shrn v6.8b, v6.8h, #8
-; VBITS_EQ_128-NEXT:    smull v5.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT:    ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT:    smull2 v3.8h, v7.16b, v16.16b
-; VBITS_EQ_128-NEXT:    smull v7.8h, v7.8b, v16.8b
-; VBITS_EQ_128-NEXT:    shrn v5.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v5.16b, v2.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT:    shrn v7.8b, v7.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v7.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT:    smull2 v31.8h, v19.16b, v16.16b
-; VBITS_EQ_128-NEXT:    smull v9.8h, v19.8b, v16.8b
-; VBITS_EQ_128-NEXT:    smull2 v21.8h, v18.16b, v17.16b
-; VBITS_EQ_128-NEXT:    smull v30.8h, v18.8b, v17.8b
-; VBITS_EQ_128-NEXT:    ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v9.8b, v9.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v30.8b, v30.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v9.16b, v31.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v30.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT:    smull2 v16.8h, v17.16b, v20.16b
-; VBITS_EQ_128-NEXT:    ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smull v18.8h, v17.8b, v20.8b
-; VBITS_EQ_128-NEXT:    ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT:    smull2 v17.8h, v22.16b, v19.16b
-; VBITS_EQ_128-NEXT:    shrn2 v4.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    smull v19.8h, v22.8b, v19.8b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v19.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT:    smull2 v12.8h, v24.16b, v22.16b
-; VBITS_EQ_128-NEXT:    smull v13.8h, v24.8b, v22.8b
-; VBITS_EQ_128-NEXT:    smull2 v10.8h, v20.16b, v23.16b
-; VBITS_EQ_128-NEXT:    ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smull v11.8h, v20.8b, v23.8b
-; VBITS_EQ_128-NEXT:    ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT:    shrn2 v6.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT:    smull2 v22.8h, v26.16b, v24.16b
-; VBITS_EQ_128-NEXT:    smull v24.8h, v26.8b, v24.8b
-; VBITS_EQ_128-NEXT:    smull2 v20.8h, v23.16b, v25.16b
-; VBITS_EQ_128-NEXT:    smull v23.8h, v23.8b, v25.8b
-; VBITS_EQ_128-NEXT:    ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT:    smull2 v15.8h, v28.16b, v26.16b
-; VBITS_EQ_128-NEXT:    smull v1.8h, v28.8b, v26.8b
-; VBITS_EQ_128-NEXT:    smull2 v14.8h, v25.16b, v27.16b
-; VBITS_EQ_128-NEXT:    smull v8.8h, v25.8b, v27.8b
-; VBITS_EQ_128-NEXT:    ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT:    shrn v8.8b, v8.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v8.16b, v14.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v23.8h, #8
-; VBITS_EQ_128-NEXT:    stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v24.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v20.8h, #8
-; VBITS_EQ_128-NEXT:    stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    smull2 v26.8h, v0.16b, v28.16b
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v22.8h, #8
-; VBITS_EQ_128-NEXT:    smull v28.8h, v0.8b, v28.8b
-; VBITS_EQ_128-NEXT:    stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT:    smull2 v25.8h, v27.16b, v29.16b
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT:    smull v27.8h, v27.8b, v29.8b
-; VBITS_EQ_128-NEXT:    shrn v29.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v13.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v11.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v29.16b, v15.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v12.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v10.8h, #8
-; VBITS_EQ_128-NEXT:    stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v27.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v28.8h, #8
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v25.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v26.8h, #8
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add sp, sp, #96
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %1 = sext <256 x i8> %op1 to <256 x i16>
@@ -376,26 +189,20 @@ define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 
 ; Don't use SVE for 64-bit vectors.
 ; FIXME: The codegen for the >=256 bits case can be improved.
-define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v4i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    smull v0.4s, v0.4h, v1.4h
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v4i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    smull v0.4s, v0.4h, v1.4h
-; VBITS_GE_256-NEXT:    ushr v1.4s, v0.4s, #16
-; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
-; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
-; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
-; VBITS_GE_256-NEXT:    mov v0.h[1], w8
-; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
-; VBITS_GE_256-NEXT:    mov v0.h[2], w9
-; VBITS_GE_256-NEXT:    mov v0.h[3], w8
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT:    ret
+define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    ushr v1.4s, v0.4s, #16
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -405,7 +212,7 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    smull2 v2.4s, v0.8h, v1.8h
@@ -420,30 +227,15 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
   ret <8 x i16> %res
 }
 
-define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v16i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    smull v4.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT:    smull2 v0.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT:    smull v5.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT:    smull2 v1.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v16i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %1 = sext <16 x i16> %op1 to <16 x i32>
@@ -456,40 +248,47 @@ define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT:    smull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT:    smull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT:    smull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT:    smull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT:    smull2 v16.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT:    smull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    smull2 v3.4s, v4.8h, v5.8h
-; VBITS_EQ_128-NEXT:    smull v4.4s, v4.4h, v5.4h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v4.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: smulh_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z7.d, z1.d
+; VBITS_GE_256-NEXT:    mov z16.d, z3.d
+; VBITS_GE_256-NEXT:    ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT:    smull2 v4.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT:    smull v5.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    smull2 v6.4s, v1.8h, v3.8h
+; VBITS_GE_256-NEXT:    smull v1.4s, v1.4h, v3.4h
+; VBITS_GE_256-NEXT:    smull2 v3.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT:    smull v0.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT:    smull2 v2.4s, v7.8h, v16.8h
+; VBITS_GE_256-NEXT:    smull v7.4s, v7.4h, v16.4h
+; VBITS_GE_256-NEXT:    uzp2 v4.8h, v5.8h, v4.8h
+; VBITS_GE_256-NEXT:    uzp2 v1.8h, v1.8h, v6.8h
+; VBITS_GE_256-NEXT:    uzp2 v0.8h, v0.8h, v3.8h
+; VBITS_GE_256-NEXT:    uzp2 v2.8h, v7.8h, v2.8h
+; VBITS_GE_256-NEXT:    splice z4.h, p1, z4.h, z0.h
+; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z2.h
+; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: smulh_v32i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: smulh_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %1 = sext <32 x i16> %op1 to <32 x i32>
@@ -501,63 +300,15 @@ define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v64i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT:    smull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT:    smull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    smull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT:    smull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT:    smull2 v17.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT:    smull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT:    ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    smull2 v19.4s, v4.8h, v16.8h
-; VBITS_EQ_128-NEXT:    smull v4.4s, v4.4h, v16.4h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT:    shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v4.8h, v19.4s, #16
-; VBITS_EQ_128-NEXT:    smull2 v21.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT:    smull v3.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT:    ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT:    smull2 v23.4s, v18.8h, v20.8h
-; VBITS_EQ_128-NEXT:    smull v18.4s, v18.4h, v20.4h
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v3.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT:    shrn v18.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn2 v18.8h, v23.4s, #16
-; VBITS_EQ_128-NEXT:    smull v20.4s, v16.4h, v5.4h
-; VBITS_EQ_128-NEXT:    smull2 v5.4s, v16.8h, v5.8h
-; VBITS_EQ_128-NEXT:    stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    smull v25.4s, v22.4h, v24.4h
-; VBITS_EQ_128-NEXT:    smull2 v16.4s, v22.8h, v24.8h
-; VBITS_EQ_128-NEXT:    shrn v20.4h, v20.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v22.4h, v25.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v20.8h, v5.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v22.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT:    stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %1 = sext <64 x i16> %op1 to <64 x i32>
@@ -569,130 +320,15 @@ define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v128i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    sub sp, sp, #96
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT:    ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT:    ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    smull2 v0.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT:    smull v4.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT:    smull2 v0.4s, v2.8h, v6.8h
-; VBITS_EQ_128-NEXT:    shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    smull v6.4s, v2.4h, v6.4h
-; VBITS_EQ_128-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    smull2 v2.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT:    shrn v6.4h, v6.4s, #16
-; VBITS_EQ_128-NEXT:    smull v5.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT:    ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT:    smull2 v3.4s, v7.8h, v16.8h
-; VBITS_EQ_128-NEXT:    smull v7.4s, v7.4h, v16.4h
-; VBITS_EQ_128-NEXT:    shrn v5.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v5.8h, v2.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT:    shrn v7.4h, v7.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v7.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT:    smull2 v31.4s, v19.8h, v16.8h
-; VBITS_EQ_128-NEXT:    smull v9.4s, v19.4h, v16.4h
-; VBITS_EQ_128-NEXT:    smull2 v21.4s, v18.8h, v17.8h
-; VBITS_EQ_128-NEXT:    smull v30.4s, v18.4h, v17.4h
-; VBITS_EQ_128-NEXT:    ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v9.4h, v9.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v30.4h, v30.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v9.8h, v31.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v30.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT:    smull2 v16.4s, v17.8h, v20.8h
-; VBITS_EQ_128-NEXT:    ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smull v18.4s, v17.4h, v20.4h
-; VBITS_EQ_128-NEXT:    ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT:    smull2 v17.4s, v22.8h, v19.8h
-; VBITS_EQ_128-NEXT:    shrn2 v4.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    smull v19.4s, v22.4h, v19.4h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v19.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT:    smull2 v12.4s, v24.8h, v22.8h
-; VBITS_EQ_128-NEXT:    smull v13.4s, v24.4h, v22.4h
-; VBITS_EQ_128-NEXT:    smull2 v10.4s, v20.8h, v23.8h
-; VBITS_EQ_128-NEXT:    ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smull v11.4s, v20.4h, v23.4h
-; VBITS_EQ_128-NEXT:    ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT:    shrn2 v6.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT:    smull2 v22.4s, v26.8h, v24.8h
-; VBITS_EQ_128-NEXT:    smull v24.4s, v26.4h, v24.4h
-; VBITS_EQ_128-NEXT:    smull2 v20.4s, v23.8h, v25.8h
-; VBITS_EQ_128-NEXT:    smull v23.4s, v23.4h, v25.4h
-; VBITS_EQ_128-NEXT:    ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT:    smull2 v15.4s, v28.8h, v26.8h
-; VBITS_EQ_128-NEXT:    smull v1.4s, v28.4h, v26.4h
-; VBITS_EQ_128-NEXT:    smull2 v14.4s, v25.8h, v27.8h
-; VBITS_EQ_128-NEXT:    smull v8.4s, v25.4h, v27.4h
-; VBITS_EQ_128-NEXT:    ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT:    shrn v8.4h, v8.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v8.8h, v14.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v23.4s, #16
-; VBITS_EQ_128-NEXT:    stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v24.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v20.4s, #16
-; VBITS_EQ_128-NEXT:    stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    smull2 v26.4s, v0.8h, v28.8h
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v22.4s, #16
-; VBITS_EQ_128-NEXT:    smull v28.4s, v0.4h, v28.4h
-; VBITS_EQ_128-NEXT:    stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT:    smull2 v25.4s, v27.8h, v29.8h
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT:    smull v27.4s, v27.4h, v29.4h
-; VBITS_EQ_128-NEXT:    shrn v29.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v13.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v11.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v29.8h, v15.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v12.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v10.4s, #16
-; VBITS_EQ_128-NEXT:    stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v27.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v28.4s, #16
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v25.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v26.4s, #16
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add sp, sp, #96
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %1 = sext <128 x i16> %op1 to <128 x i32>
@@ -705,7 +341,7 @@ define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
@@ -714,8 +350,6 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT:    ret
-
-
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -725,7 +359,7 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
@@ -740,39 +374,15 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
   ret <4 x i32> %res
 }
 
-define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v8i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    sshll v5.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    sshll v4.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v7.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v6.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z7.d
-; VBITS_EQ_128-NEXT:    sshll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z6.d
-; VBITS_EQ_128-NEXT:    shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v2.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_128-NEXT:    shrn2 v5.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v2.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    stp q5, q2, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v8i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %1 = sext <8 x i32> %op1 to <8 x i64>
@@ -785,57 +395,47 @@ define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v16i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q1, q2, [x0, #32]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    sshll v19.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT:    sshll v18.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v7.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q5, q6, [x1, #32]
-; VBITS_EQ_128-NEXT:    sshll v0.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v4.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v21.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v5.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT:    sshll2 v22.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z19.d
-; VBITS_EQ_128-NEXT:    sshll v6.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z21.d
-; VBITS_EQ_128-NEXT:    shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    mul z2.d, p0/m, z2.d, z22.d
-; VBITS_EQ_128-NEXT:    sshll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    mul z6.d, p0/m, z6.d, z18.d
-; VBITS_EQ_128-NEXT:    sshll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v20.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    mul z7.d, p0/m, z7.d, z19.d
-; VBITS_EQ_128-NEXT:    sshll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z20.d
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v7.2s, v7.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z17.d
-; VBITS_EQ_128-NEXT:    shrn2 v5.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v2.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v7.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    stp q5, q6, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q7, q0, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: smulh_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z7.d, z1.d
+; VBITS_GE_256-NEXT:    mov z16.d, z3.d
+; VBITS_GE_256-NEXT:    ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT:    smull2 v4.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT:    smull v5.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    smull2 v6.2d, v1.4s, v3.4s
+; VBITS_GE_256-NEXT:    smull v1.2d, v1.2s, v3.2s
+; VBITS_GE_256-NEXT:    smull2 v3.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT:    smull v0.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT:    smull2 v2.2d, v7.4s, v16.4s
+; VBITS_GE_256-NEXT:    smull v7.2d, v7.2s, v16.2s
+; VBITS_GE_256-NEXT:    uzp2 v4.4s, v5.4s, v4.4s
+; VBITS_GE_256-NEXT:    uzp2 v1.4s, v1.4s, v6.4s
+; VBITS_GE_256-NEXT:    uzp2 v0.4s, v0.4s, v3.4s
+; VBITS_GE_256-NEXT:    uzp2 v2.4s, v7.4s, v2.4s
+; VBITS_GE_256-NEXT:    splice z4.s, p1, z4.s, z0.s
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z2.s
+; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: smulh_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: smulh_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %1 = sext <16 x i32> %op1 to <16 x i64>
@@ -847,104 +447,15 @@ define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -32
-; VBITS_EQ_128-NEXT:    ldp q17, q16, [x0, #64]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    sshll v27.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v29.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q23, q28, [x0, #96]
-; VBITS_EQ_128-NEXT:    sshll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v22.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v31.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v8.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q26, q25, [x1, #96]
-; VBITS_EQ_128-NEXT:    sshll v30.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v28.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v9.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v26.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q24, q21, [x1, #64]
-; VBITS_EQ_128-NEXT:    mul z26.d, p0/m, z26.d, z31.d
-; VBITS_EQ_128-NEXT:    mul z8.d, p0/m, z8.d, z9.d
-; VBITS_EQ_128-NEXT:    sshll2 v10.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v25.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v31.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT:    mul z28.d, p0/m, z28.d, z10.d
-; VBITS_EQ_128-NEXT:    sshll v24.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT:    mul z25.d, p0/m, z25.d, z30.d
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    mul z24.d, p0/m, z24.d, z27.d
-; VBITS_EQ_128-NEXT:    mul z29.d, p0/m, z29.d, z31.d
-; VBITS_EQ_128-NEXT:    sshll2 v30.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v21.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT:    sshll v6.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT:    mul z22.d, p0/m, z22.d, z30.d
-; VBITS_EQ_128-NEXT:    mul z19.d, p0/m, z19.d, z21.d
-; VBITS_EQ_128-NEXT:    ldp q20, q18, [x1, #32]
-; VBITS_EQ_128-NEXT:    sshll v4.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT:    shrn v19.2s, v19.2d, #32
-; VBITS_EQ_128-NEXT:    sshll2 v5.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v7.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v27.2d, v20.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v20.2d, v20.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q3, q1, [x0]
-; VBITS_EQ_128-NEXT:    mul z6.d, p0/m, z6.d, z20.d
-; VBITS_EQ_128-NEXT:    mul z7.d, p0/m, z7.d, z27.d
-; VBITS_EQ_128-NEXT:    sshll2 v21.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    sshll v18.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT:    sshll v2.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z21.d
-; VBITS_EQ_128-NEXT:    sshll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z18.d
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT:    sshll v0.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    shrn v4.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    sshll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    shrn v18.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT:    sshll v20.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT:    sshll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v23.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    mul z2.d, p0/m, z2.d, z20.d
-; VBITS_EQ_128-NEXT:    sshll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT:    shrn v16.2s, v26.2d, #32
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z23.d
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v2.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v17.2s, v25.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v16.4s, v8.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v18.4s, v29.2d, #32
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    shrn2 v17.4s, v28.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v4.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v2.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    stp q18, q19, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q6, q4, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q2, q0, [x0]
-; VBITS_EQ_128-NEXT:    stp q16, q17, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %1 = sext <32 x i32> %op1 to <32 x i64>
@@ -956,276 +467,15 @@ define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v64i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 80
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset w29, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -64
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -72
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -80
-; VBITS_EQ_128-NEXT:    addvl sp, sp, #-12
-; VBITS_EQ_128-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG
-; VBITS_EQ_128-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG
-; VBITS_EQ_128-NEXT:    ldp q4, q5, [x0, #96]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    stp q5, q4, [sp, #-80]! // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q0, q2, [x0, #48]
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ldr q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldr q3, [x0, #80]
-; VBITS_EQ_128-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    stp q3, q2, [sp, #32] // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    str z1, [x8, #11, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    str z0, [x8, #10, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #9, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q23, q26, [x0, #128]
-; VBITS_EQ_128-NEXT:    str z0, [x8, #8, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #7, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q25, q24, [x0, #160]
-; VBITS_EQ_128-NEXT:    str z0, [x8, #6, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v1.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #5, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v27.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q30, q0, [x0, #192]
-; VBITS_EQ_128-NEXT:    str z1, [x8, #4, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v9.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v12.2d, v30.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q31, q1, [x0, #224]
-; VBITS_EQ_128-NEXT:    sshll v11.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v8.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v10.2d, v31.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v15.2d, v31.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q29, q28, [x1, #224]
-; VBITS_EQ_128-NEXT:    sshll2 v18.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v31.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v2.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q14, q0, [x1, #192]
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT:    sshll v20.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v19.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT:    mul z11.d, p0/m, z11.d, z20.d
-; VBITS_EQ_128-NEXT:    ldp q21, q22, [x0]
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z18.d
-; VBITS_EQ_128-NEXT:    sshll v18.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT:    sshll v20.2d, v14.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q4, q13, [x1, #160]
-; VBITS_EQ_128-NEXT:    ldp q5, q6, [x1, #128]
-; VBITS_EQ_128-NEXT:    ldp q7, q3, [x1, #96]
-; VBITS_EQ_128-NEXT:    str z0, [x8, #3, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ldp q17, q16, [x1, #64]
-; VBITS_EQ_128-NEXT:    movprfx z0, z31
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT:    str z0, [x8, #1, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z0, z15
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v30.2s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #2, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ldp q2, q29, [x1, #32]
-; VBITS_EQ_128-NEXT:    movprfx z15, z10
-; VBITS_EQ_128-NEXT:    mul z15.d, p0/m, z15.d, z18.d
-; VBITS_EQ_128-NEXT:    movprfx z0, z8
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z19.d
-; VBITS_EQ_128-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v14.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q19, q18, [x1]
-; VBITS_EQ_128-NEXT:    movprfx z10, z12
-; VBITS_EQ_128-NEXT:    mul z10.d, p0/m, z10.d, z0.d
-; VBITS_EQ_128-NEXT:    movprfx z8, z1
-; VBITS_EQ_128-NEXT:    mul z8.d, p0/m, z8.d, z20.d
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v13.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v12.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v13.2s, #0
-; VBITS_EQ_128-NEXT:    mul z9.d, p0/m, z9.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT:    mul z12.d, p0/m, z12.d, z1.d
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll v20.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z13, z20
-; VBITS_EQ_128-NEXT:    mul z13.d, p0/m, z13.d, z1.d
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z14, z6
-; VBITS_EQ_128-NEXT:    mul z14.d, p0/m, z14.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll v4.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z30, z4
-; VBITS_EQ_128-NEXT:    mul z30.d, p0/m, z30.d, z1.d
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z31, z4
-; VBITS_EQ_128-NEXT:    mul z31.d, p0/m, z31.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll v6.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT:    ldr q4, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    movprfx z28, z6
-; VBITS_EQ_128-NEXT:    mul z28.d, p0/m, z28.d, z1.d
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z23, z3
-; VBITS_EQ_128-NEXT:    mul z23.d, p0/m, z23.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll v5.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT:    ldr q3, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    movprfx z20, z5
-; VBITS_EQ_128-NEXT:    mul z20.d, p0/m, z20.d, z1.d
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v4.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z7, z1
-; VBITS_EQ_128-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; VBITS_EQ_128-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    sshll v3.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z6, z3
-; VBITS_EQ_128-NEXT:    mul z6.d, p0/m, z6.d, z4.d
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v5.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    movprfx z26, z1
-; VBITS_EQ_128-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; VBITS_EQ_128-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll v3.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z24, z5
-; VBITS_EQ_128-NEXT:    mul z24.d, p0/m, z24.d, z3.d
-; VBITS_EQ_128-NEXT:    sshll v16.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    movprfx z25, z1
-; VBITS_EQ_128-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll v5.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v17.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z29, z16
-; VBITS_EQ_128-NEXT:    mul z29.d, p0/m, z29.d, z5.d
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z4, z1
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z0.d
-; VBITS_EQ_128-NEXT:    sshll v5.2d, v22.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    sshll2 v16.2d, v22.4s, #0
-; VBITS_EQ_128-NEXT:    movprfx z22, z0
-; VBITS_EQ_128-NEXT:    mul z22.d, p0/m, z22.d, z17.d
-; VBITS_EQ_128-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    sshll v1.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v17.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v3.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT:    sshll2 v18.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT:    movprfx z2, z5
-; VBITS_EQ_128-NEXT:    mul z2.d, p0/m, z2.d, z3.d
-; VBITS_EQ_128-NEXT:    mul z18.d, p0/m, z18.d, z16.d
-; VBITS_EQ_128-NEXT:    sshll2 v5.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT:    sshll2 v16.2d, v19.4s, #0
-; VBITS_EQ_128-NEXT:    sshll v17.2d, v19.2s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z16.d
-; VBITS_EQ_128-NEXT:    shrn v16.2s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    sshll v3.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT:    shrn v21.2s, v22.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v16.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    shrn v1.2s, v20.2d, #32
-; VBITS_EQ_128-NEXT:    mul z17.d, p0/m, z17.d, z3.d
-; VBITS_EQ_128-NEXT:    shrn2 v21.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v3.2s, v13.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    shrn v4.2s, v12.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v7.2s, v15.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v1.4s, v23.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    shrn2 v3.4s, v27.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v4.4s, v9.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v19.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v19.2s, v11.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z22, [x8] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q16, q21, [x0, #32]
-; VBITS_EQ_128-NEXT:    shrn2 v7.4s, v20.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v20.2s, v8.2d, #32
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT:    stp q3, q4, [x0, #160]
-; VBITS_EQ_128-NEXT:    shrn v3.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT:    stp q7, q6, [x0, #224]
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v30.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v7.2s, v28.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v4.2s, v29.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v1.2s, v17.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v20.4s, v10.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v14.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v7.4s, v31.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v3.4s, v26.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v4.4s, v25.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v18.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v1.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    stp q7, q6, [x0, #128]
-; VBITS_EQ_128-NEXT:    stp q4, q3, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT:    stp q20, q19, [x0, #192]
-; VBITS_EQ_128-NEXT:    addvl sp, sp, #12
-; VBITS_EQ_128-NEXT:    add sp, sp, #80
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %1 = sext <64 x i32> %op1 to <64 x i64>
@@ -1238,25 +488,15 @@ define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v1i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    // kill: def $d1 killed $d1 def $q1
-; VBITS_EQ_128-NEXT:    // kill: def $d0 killed $d0 def $q0
-; VBITS_EQ_128-NEXT:    fmov x8, d0
-; VBITS_EQ_128-NEXT:    fmov x9, d1
-; VBITS_EQ_128-NEXT:    smulh x8, x8, x9
-; VBITS_EQ_128-NEXT:    fmov d0, x8
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v1i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl1
-; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
-; VBITS_GE_256-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT:    ret
+define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
   %1 = sext <1 x i64> %op1 to <1 x i128>
@@ -1268,28 +508,15 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 }
 
 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: smulh_v2i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    mov x8, v0.d[1]
-; VBITS_EQ_128-NEXT:    fmov x10, d0
-; VBITS_EQ_128-NEXT:    mov x9, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d1
-; VBITS_EQ_128-NEXT:    smulh x10, x10, x11
-; VBITS_EQ_128-NEXT:    smulh x8, x8, x9
-; VBITS_EQ_128-NEXT:    fmov d0, x10
-; VBITS_EQ_128-NEXT:    fmov d1, x8
-; VBITS_EQ_128-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v2i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl2
-; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_256-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; VBITS_GE_256-NEXT:    ret
+define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -1298,40 +525,15 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
   ret <2 x i64> %res
 }
 
-define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v4i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    mov x10, v0.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d0
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    mov x8, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov x9, d1
-; VBITS_EQ_128-NEXT:    mov x12, v2.d[1]
-; VBITS_EQ_128-NEXT:    fmov x13, d2
-; VBITS_EQ_128-NEXT:    mov x14, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x15, d3
-; VBITS_EQ_128-NEXT:    smulh x11, x11, x13
-; VBITS_EQ_128-NEXT:    smulh x10, x10, x12
-; VBITS_EQ_128-NEXT:    smulh x9, x9, x15
-; VBITS_EQ_128-NEXT:    smulh x8, x8, x14
-; VBITS_EQ_128-NEXT:    fmov d0, x11
-; VBITS_EQ_128-NEXT:    fmov d1, x10
-; VBITS_EQ_128-NEXT:    fmov d2, x9
-; VBITS_EQ_128-NEXT:    fmov d3, x8
-; VBITS_EQ_128-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT:    mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT:    stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: smulh_v4i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: smulh_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %1 = sext <4 x i64> %op1 to <4 x i128>
@@ -1344,60 +546,69 @@ define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v8i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    fmov x14, d0
-; VBITS_EQ_128-NEXT:    mov x13, v0.d[1]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov x12, d1
-; VBITS_EQ_128-NEXT:    mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_EQ_128-NEXT:    mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x9, d3
-; VBITS_EQ_128-NEXT:    fmov x17, d4
-; VBITS_EQ_128-NEXT:    mov x15, v4.d[1]
-; VBITS_EQ_128-NEXT:    ldp q3, q1, [x1]
-; VBITS_EQ_128-NEXT:    fmov x1, d5
-; VBITS_EQ_128-NEXT:    smulh x14, x14, x17
-; VBITS_EQ_128-NEXT:    mov x18, v5.d[1]
-; VBITS_EQ_128-NEXT:    smulh x13, x13, x15
-; VBITS_EQ_128-NEXT:    fmov x15, d2
-; VBITS_EQ_128-NEXT:    smulh x12, x12, x1
-; VBITS_EQ_128-NEXT:    mov x1, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x17, d1
-; VBITS_EQ_128-NEXT:    smulh x11, x11, x18
-; VBITS_EQ_128-NEXT:    mov x16, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov d2, x13
-; VBITS_EQ_128-NEXT:    fmov d5, x12
-; VBITS_EQ_128-NEXT:    smulh x9, x9, x17
-; VBITS_EQ_128-NEXT:    fmov x17, d3
-; VBITS_EQ_128-NEXT:    smulh x10, x10, x1
-; VBITS_EQ_128-NEXT:    fmov d3, x14
-; VBITS_EQ_128-NEXT:    smulh x8, x8, x16
-; VBITS_EQ_128-NEXT:    fmov d4, x11
-; VBITS_EQ_128-NEXT:    smulh x15, x15, x17
-; VBITS_EQ_128-NEXT:    fmov d1, x9
-; VBITS_EQ_128-NEXT:    fmov d6, x10
-; VBITS_EQ_128-NEXT:    fmov d0, x8
-; VBITS_EQ_128-NEXT:    fmov d7, x15
-; VBITS_EQ_128-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT:    mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT:    mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT:    stp q3, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q7, q1, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: smulh_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ptrue p1.d, vl2
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov x9, v0.d[1]
+; VBITS_GE_256-NEXT:    fmov x10, d0
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    fmov x17, d2
+; VBITS_GE_256-NEXT:    mov x13, v2.d[1]
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    mov x14, v0.d[1]
+; VBITS_GE_256-NEXT:    mov x18, v2.d[1]
+; VBITS_GE_256-NEXT:    smulh x10, x10, x17
+; VBITS_GE_256-NEXT:    mov x11, v1.d[1]
+; VBITS_GE_256-NEXT:    fmov x12, d1
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    mov x2, v3.d[1]
+; VBITS_GE_256-NEXT:    fmov x3, d3
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT:    smulh x9, x9, x13
+; VBITS_GE_256-NEXT:    mov x13, v1.d[1]
+; VBITS_GE_256-NEXT:    smulh x14, x14, x18
+; VBITS_GE_256-NEXT:    mov x18, v3.d[1]
+; VBITS_GE_256-NEXT:    smulh x12, x12, x3
+; VBITS_GE_256-NEXT:    fmov x15, d0
+; VBITS_GE_256-NEXT:    fmov x16, d1
+; VBITS_GE_256-NEXT:    fmov x1, d2
+; VBITS_GE_256-NEXT:    fmov x17, d3
+; VBITS_GE_256-NEXT:    fmov d0, x9
+; VBITS_GE_256-NEXT:    fmov d1, x10
+; VBITS_GE_256-NEXT:    smulh x9, x11, x2
+; VBITS_GE_256-NEXT:    smulh x15, x15, x1
+; VBITS_GE_256-NEXT:    fmov d4, x12
+; VBITS_GE_256-NEXT:    smulh x16, x16, x17
+; VBITS_GE_256-NEXT:    smulh x10, x13, x18
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    fmov d0, x14
+; VBITS_GE_256-NEXT:    fmov d2, x15
+; VBITS_GE_256-NEXT:    fmov d3, x9
+; VBITS_GE_256-NEXT:    fmov d6, x16
+; VBITS_GE_256-NEXT:    fmov d5, x10
+; VBITS_GE_256-NEXT:    mov v2.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    mov v4.d[1], v3.d[0]
+; VBITS_GE_256-NEXT:    mov v6.d[1], v5.d[0]
+; VBITS_GE_256-NEXT:    splice z1.d, p1, z1.d, z2.d
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    splice z4.d, p1, z4.d, z6.d
+; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: smulh_v8i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: smulh_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %1 = sext <8 x i64> %op1 to <8 x i128>
@@ -1409,111 +620,15 @@ define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v16i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    str x21, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset w21, -32
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d2
-; VBITS_EQ_128-NEXT:    ldp q4, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x9, d3
-; VBITS_EQ_128-NEXT:    mov x14, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x15, d4
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT:    fmov x13, d5
-; VBITS_EQ_128-NEXT:    fmov x5, d0
-; VBITS_EQ_128-NEXT:    mov x4, v0.d[1]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x0, #64]
-; VBITS_EQ_128-NEXT:    mov x3, v1.d[1]
-; VBITS_EQ_128-NEXT:    mov x18, v2.d[1]
-; VBITS_EQ_128-NEXT:    fmov x2, d2
-; VBITS_EQ_128-NEXT:    ldp q5, q6, [x1, #96]
-; VBITS_EQ_128-NEXT:    mov x16, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x17, d3
-; VBITS_EQ_128-NEXT:    fmov x19, d5
-; VBITS_EQ_128-NEXT:    mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT:    ldp q4, q7, [x1, #64]
-; VBITS_EQ_128-NEXT:    mov x20, v6.d[1]
-; VBITS_EQ_128-NEXT:    fmov x21, d6
-; VBITS_EQ_128-NEXT:    smulh x5, x5, x19
-; VBITS_EQ_128-NEXT:    smulh x4, x4, x6
-; VBITS_EQ_128-NEXT:    mov x19, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x6, d4
-; VBITS_EQ_128-NEXT:    smulh x3, x3, x20
-; VBITS_EQ_128-NEXT:    ldp q3, q16, [x1, #32]
-; VBITS_EQ_128-NEXT:    fmov x20, d7
-; VBITS_EQ_128-NEXT:    smulh x2, x2, x6
-; VBITS_EQ_128-NEXT:    smulh x18, x18, x19
-; VBITS_EQ_128-NEXT:    fmov d18, x4
-; VBITS_EQ_128-NEXT:    fmov d19, x5
-; VBITS_EQ_128-NEXT:    fmov d20, x3
-; VBITS_EQ_128-NEXT:    smulh x17, x17, x20
-; VBITS_EQ_128-NEXT:    fmov x19, d3
-; VBITS_EQ_128-NEXT:    fmov d23, x2
-; VBITS_EQ_128-NEXT:    ldp q2, q17, [x1]
-; VBITS_EQ_128-NEXT:    fmov x1, d1
-; VBITS_EQ_128-NEXT:    fmov x20, d16
-; VBITS_EQ_128-NEXT:    smulh x15, x15, x19
-; VBITS_EQ_128-NEXT:    fmov d22, x18
-; VBITS_EQ_128-NEXT:    mov v19.d[1], v18.d[0]
-; VBITS_EQ_128-NEXT:    smulh x1, x1, x21
-; VBITS_EQ_128-NEXT:    mov x21, v7.d[1]
-; VBITS_EQ_128-NEXT:    smulh x13, x13, x20
-; VBITS_EQ_128-NEXT:    mov x7, v17.d[1]
-; VBITS_EQ_128-NEXT:    mov x6, v2.d[1]
-; VBITS_EQ_128-NEXT:    mov x20, v16.d[1]
-; VBITS_EQ_128-NEXT:    smulh x16, x16, x21
-; VBITS_EQ_128-NEXT:    fmov x21, d2
-; VBITS_EQ_128-NEXT:    fmov x19, d17
-; VBITS_EQ_128-NEXT:    smulh x8, x8, x7
-; VBITS_EQ_128-NEXT:    smulh x10, x10, x6
-; VBITS_EQ_128-NEXT:    fmov d5, x13
-; VBITS_EQ_128-NEXT:    smulh x11, x11, x21
-; VBITS_EQ_128-NEXT:    fmov d7, x15
-; VBITS_EQ_128-NEXT:    mov x21, v3.d[1]
-; VBITS_EQ_128-NEXT:    smulh x9, x9, x19
-; VBITS_EQ_128-NEXT:    smulh x12, x12, x20
-; VBITS_EQ_128-NEXT:    fmov d0, x8
-; VBITS_EQ_128-NEXT:    fmov d2, x10
-; VBITS_EQ_128-NEXT:    fmov d16, x16
-; VBITS_EQ_128-NEXT:    fmov d3, x11
-; VBITS_EQ_128-NEXT:    fmov d17, x17
-; VBITS_EQ_128-NEXT:    smulh x14, x14, x21
-; VBITS_EQ_128-NEXT:    fmov d1, x9
-; VBITS_EQ_128-NEXT:    fmov d4, x12
-; VBITS_EQ_128-NEXT:    fmov d21, x1
-; VBITS_EQ_128-NEXT:    mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT:    mov v17.d[1], v16.d[0]
-; VBITS_EQ_128-NEXT:    fmov d6, x14
-; VBITS_EQ_128-NEXT:    mov v21.d[1], v20.d[0]
-; VBITS_EQ_128-NEXT:    mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT:    mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT:    stp q23, q17, [x0, #64]
-; VBITS_EQ_128-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT:    stp q19, q21, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q3, q1, [x0]
-; VBITS_EQ_128-NEXT:    ldr x21, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: smulh_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %1 = sext <16 x i64> %op1 to <16 x i128>
@@ -1525,237 +640,15 @@ define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: smulh_v32i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    sub sp, sp, #224
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 224
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #96] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x29, x30, [sp, #128] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x28, x27, [sp, #144] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x26, x25, [sp, #160] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x24, x23, [sp, #176] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x22, x21, [sp, #192] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x20, x19, [sp, #208] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset w21, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset w22, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset w23, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset w24, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset w25, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset w26, -64
-; VBITS_EQ_128-NEXT:    .cfi_offset w27, -72
-; VBITS_EQ_128-NEXT:    .cfi_offset w28, -80
-; VBITS_EQ_128-NEXT:    .cfi_offset w30, -88
-; VBITS_EQ_128-NEXT:    .cfi_offset w29, -96
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -104
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -112
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -120
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -128
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -136
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -144
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -152
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -160
-; VBITS_EQ_128-NEXT:    ldp q3, q2, [x0]
-; VBITS_EQ_128-NEXT:    mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT:    ldp q5, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    fmov x2, d2
-; VBITS_EQ_128-NEXT:    str x8, [sp, #16] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    fmov x8, d3
-; VBITS_EQ_128-NEXT:    mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT:    fmov x7, d5
-; VBITS_EQ_128-NEXT:    str x8, [sp] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q6, q3, [x0, #96]
-; VBITS_EQ_128-NEXT:    mov x20, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x21, d4
-; VBITS_EQ_128-NEXT:    mov x23, v6.d[1]
-; VBITS_EQ_128-NEXT:    fmov x24, d6
-; VBITS_EQ_128-NEXT:    ldp q16, q4, [x0, #128]
-; VBITS_EQ_128-NEXT:    mov x26, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x27, d3
-; VBITS_EQ_128-NEXT:    mov x28, v16.d[1]
-; VBITS_EQ_128-NEXT:    fmov x25, d16
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #224]
-; VBITS_EQ_128-NEXT:    mov x22, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x19, d4
-; VBITS_EQ_128-NEXT:    mov x13, v7.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d7
-; VBITS_EQ_128-NEXT:    ldp q17, q6, [x0, #192]
-; VBITS_EQ_128-NEXT:    mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT:    fmov x10, d5
-; VBITS_EQ_128-NEXT:    mov x17, v17.d[1]
-; VBITS_EQ_128-NEXT:    fmov x16, d17
-; VBITS_EQ_128-NEXT:    ldp q18, q3, [x0, #160]
-; VBITS_EQ_128-NEXT:    mov x15, v6.d[1]
-; VBITS_EQ_128-NEXT:    fmov x14, d6
-; VBITS_EQ_128-NEXT:    mov x5, v18.d[1]
-; VBITS_EQ_128-NEXT:    fmov x4, d18
-; VBITS_EQ_128-NEXT:    ldp q19, q16, [x1, #224]
-; VBITS_EQ_128-NEXT:    mov x29, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x18, d3
-; VBITS_EQ_128-NEXT:    fmov x8, d19
-; VBITS_EQ_128-NEXT:    mov x9, v19.d[1]
-; VBITS_EQ_128-NEXT:    ldp q21, q20, [x1, #192]
-; VBITS_EQ_128-NEXT:    mov x30, v16.d[1]
-; VBITS_EQ_128-NEXT:    smulh x8, x11, x8
-; VBITS_EQ_128-NEXT:    smulh x11, x13, x9
-; VBITS_EQ_128-NEXT:    fmov x9, d21
-; VBITS_EQ_128-NEXT:    str x8, [sp, #48] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q22, q18, [x1, #160]
-; VBITS_EQ_128-NEXT:    ldp q24, q23, [x1, #128]
-; VBITS_EQ_128-NEXT:    ldp q25, q17, [x1, #96]
-; VBITS_EQ_128-NEXT:    ldp q26, q6, [x1, #64]
-; VBITS_EQ_128-NEXT:    ldp q4, q3, [x1, #32]
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x1]
-; VBITS_EQ_128-NEXT:    fmov x1, d16
-; VBITS_EQ_128-NEXT:    smulh x10, x10, x1
-; VBITS_EQ_128-NEXT:    mov x1, v20.d[1]
-; VBITS_EQ_128-NEXT:    ldp q1, q0, [x0, #32]
-; VBITS_EQ_128-NEXT:    str x10, [sp, #56] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    smulh x10, x12, x30
-; VBITS_EQ_128-NEXT:    mov x30, v21.d[1]
-; VBITS_EQ_128-NEXT:    fmov x3, d1
-; VBITS_EQ_128-NEXT:    str x10, [sp, #24] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    fmov x10, d20
-; VBITS_EQ_128-NEXT:    ldr x13, [sp, #16] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d13, d11, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smulh x8, x14, x10
-; VBITS_EQ_128-NEXT:    smulh x10, x15, x1
-; VBITS_EQ_128-NEXT:    fmov x15, d18
-; VBITS_EQ_128-NEXT:    smulh x14, x16, x9
-; VBITS_EQ_128-NEXT:    mov x9, v22.d[1]
-; VBITS_EQ_128-NEXT:    smulh x16, x17, x30
-; VBITS_EQ_128-NEXT:    stp x11, x8, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    fmov x17, d22
-; VBITS_EQ_128-NEXT:    mov x8, v18.d[1]
-; VBITS_EQ_128-NEXT:    smulh x18, x18, x15
-; VBITS_EQ_128-NEXT:    mov x15, v23.d[1]
-; VBITS_EQ_128-NEXT:    str x10, [sp, #8] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    smulh x4, x4, x17
-; VBITS_EQ_128-NEXT:    fmov d8, x16
-; VBITS_EQ_128-NEXT:    mov x17, v24.d[1]
-; VBITS_EQ_128-NEXT:    smulh x5, x5, x9
-; VBITS_EQ_128-NEXT:    smulh x1, x29, x8
-; VBITS_EQ_128-NEXT:    fmov x8, d23
-; VBITS_EQ_128-NEXT:    fmov x9, d24
-; VBITS_EQ_128-NEXT:    smulh x22, x22, x15
-; VBITS_EQ_128-NEXT:    fmov x15, d17
-; VBITS_EQ_128-NEXT:    fmov d9, x14
-; VBITS_EQ_128-NEXT:    smulh x19, x19, x8
-; VBITS_EQ_128-NEXT:    ldr d14, [sp, #8] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    mov x8, v17.d[1]
-; VBITS_EQ_128-NEXT:    smulh x25, x25, x9
-; VBITS_EQ_128-NEXT:    mov x9, v25.d[1]
-; VBITS_EQ_128-NEXT:    smulh x28, x28, x17
-; VBITS_EQ_128-NEXT:    fmov x17, d25
-; VBITS_EQ_128-NEXT:    smulh x15, x27, x15
-; VBITS_EQ_128-NEXT:    mov x27, v6.d[1]
-; VBITS_EQ_128-NEXT:    ldr d15, [sp, #40] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smulh x12, x26, x8
-; VBITS_EQ_128-NEXT:    fmov x26, d6
-; VBITS_EQ_128-NEXT:    smulh x17, x24, x17
-; VBITS_EQ_128-NEXT:    ldr x8, [sp] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    mov x24, v26.d[1]
-; VBITS_EQ_128-NEXT:    smulh x11, x23, x9
-; VBITS_EQ_128-NEXT:    fmov x23, d26
-; VBITS_EQ_128-NEXT:    smulh x21, x21, x26
-; VBITS_EQ_128-NEXT:    fmov x26, d0
-; VBITS_EQ_128-NEXT:    smulh x20, x20, x27
-; VBITS_EQ_128-NEXT:    fmov x27, d3
-; VBITS_EQ_128-NEXT:    fmov d20, x17
-; VBITS_EQ_128-NEXT:    smulh x7, x7, x23
-; VBITS_EQ_128-NEXT:    fmov x23, d4
-; VBITS_EQ_128-NEXT:    smulh x6, x6, x24
-; VBITS_EQ_128-NEXT:    fmov x24, d5
-; VBITS_EQ_128-NEXT:    smulh x26, x26, x27
-; VBITS_EQ_128-NEXT:    fmov x27, d7
-; VBITS_EQ_128-NEXT:    smulh x3, x3, x23
-; VBITS_EQ_128-NEXT:    fmov d19, x20
-; VBITS_EQ_128-NEXT:    mov x23, v2.d[1]
-; VBITS_EQ_128-NEXT:    smulh x2, x2, x24
-; VBITS_EQ_128-NEXT:    mov x24, v1.d[1]
-; VBITS_EQ_128-NEXT:    smulh x27, x8, x27
-; VBITS_EQ_128-NEXT:    mov x29, v0.d[1]
-; VBITS_EQ_128-NEXT:    mov x30, v7.d[1]
-; VBITS_EQ_128-NEXT:    mov x8, v5.d[1]
-; VBITS_EQ_128-NEXT:    mov x9, v4.d[1]
-; VBITS_EQ_128-NEXT:    mov x10, v3.d[1]
-; VBITS_EQ_128-NEXT:    ldp d10, d12, [sp, #24] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    smulh x30, x13, x30
-; VBITS_EQ_128-NEXT:    fmov d0, x27
-; VBITS_EQ_128-NEXT:    smulh x8, x23, x8
-; VBITS_EQ_128-NEXT:    fmov d2, x2
-; VBITS_EQ_128-NEXT:    smulh x9, x24, x9
-; VBITS_EQ_128-NEXT:    fmov d4, x3
-; VBITS_EQ_128-NEXT:    smulh x10, x29, x10
-; VBITS_EQ_128-NEXT:    fmov d6, x26
-; VBITS_EQ_128-NEXT:    mov v11.d[1], v10.d[0]
-; VBITS_EQ_128-NEXT:    fmov d1, x30
-; VBITS_EQ_128-NEXT:    mov v13.d[1], v12.d[0]
-; VBITS_EQ_128-NEXT:    mov v15.d[1], v14.d[0]
-; VBITS_EQ_128-NEXT:    mov v9.d[1], v8.d[0]
-; VBITS_EQ_128-NEXT:    fmov d3, x8
-; VBITS_EQ_128-NEXT:    fmov d5, x9
-; VBITS_EQ_128-NEXT:    fmov d7, x10
-; VBITS_EQ_128-NEXT:    fmov d17, x6
-; VBITS_EQ_128-NEXT:    fmov d16, x7
-; VBITS_EQ_128-NEXT:    fmov d18, x21
-; VBITS_EQ_128-NEXT:    fmov d21, x11
-; VBITS_EQ_128-NEXT:    fmov d22, x12
-; VBITS_EQ_128-NEXT:    fmov d23, x15
-; VBITS_EQ_128-NEXT:    fmov d24, x28
-; VBITS_EQ_128-NEXT:    fmov d25, x25
-; VBITS_EQ_128-NEXT:    fmov d26, x22
-; VBITS_EQ_128-NEXT:    fmov d27, x19
-; VBITS_EQ_128-NEXT:    fmov d28, x5
-; VBITS_EQ_128-NEXT:    fmov d29, x4
-; VBITS_EQ_128-NEXT:    fmov d30, x1
-; VBITS_EQ_128-NEXT:    fmov d31, x18
-; VBITS_EQ_128-NEXT:    mov v27.d[1], v26.d[0]
-; VBITS_EQ_128-NEXT:    stp q9, q15, [x0, #192]
-; VBITS_EQ_128-NEXT:    stp q13, q11, [x0, #224]
-; VBITS_EQ_128-NEXT:    mov v31.d[1], v30.d[0]
-; VBITS_EQ_128-NEXT:    mov v29.d[1], v28.d[0]
-; VBITS_EQ_128-NEXT:    mov v25.d[1], v24.d[0]
-; VBITS_EQ_128-NEXT:    mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT:    mov v20.d[1], v21.d[0]
-; VBITS_EQ_128-NEXT:    mov v18.d[1], v19.d[0]
-; VBITS_EQ_128-NEXT:    stp q29, q31, [x0, #160]
-; VBITS_EQ_128-NEXT:    mov v16.d[1], v17.d[0]
-; VBITS_EQ_128-NEXT:    stp q25, q27, [x0, #128]
-; VBITS_EQ_128-NEXT:    mov v6.d[1], v7.d[0]
-; VBITS_EQ_128-NEXT:    mov v4.d[1], v5.d[0]
-; VBITS_EQ_128-NEXT:    stp q20, q23, [x0, #96]
-; VBITS_EQ_128-NEXT:    mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT:    stp q16, q18, [x0, #64]
-; VBITS_EQ_128-NEXT:    ldp x20, x19, [sp, #208] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q4, q6, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp x22, x21, [sp, #192] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT:    ldp x24, x23, [sp, #176] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp x26, x25, [sp, #160] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp x28, x27, [sp, #144] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp x29, x30, [sp, #128] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #112] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #96] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add sp, sp, #224
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: smulh_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %1 = sext <32 x i64> %op1 to <32 x i128>
@@ -1773,35 +666,29 @@ define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 
 ; Don't use SVE for 64-bit vectors.
 ; FIXME: The codegen for the >=256 bits case can be improved.
-define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v8i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    umull v0.8h, v0.8b, v1.8b
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v8i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    umull v0.8h, v0.8b, v1.8b
-; VBITS_GE_256-NEXT:    ushr v1.8h, v0.8h, #8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
-; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
-; VBITS_GE_256-NEXT:    fmov s0, w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
-; VBITS_GE_256-NEXT:    mov v0.b[1], w9
-; VBITS_GE_256-NEXT:    mov v0.b[2], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
-; VBITS_GE_256-NEXT:    mov v0.b[3], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
-; VBITS_GE_256-NEXT:    mov v0.b[4], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
-; VBITS_GE_256-NEXT:    mov v0.b[5], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
-; VBITS_GE_256-NEXT:    mov v0.b[6], w8
-; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
-; VBITS_GE_256-NEXT:    mov v0.b[7], w8
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT:    ret
+define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ushr v1.8h, v0.8h, #8
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    mov v0.b[1], w9
+; CHECK-NEXT:    mov v0.b[2], w8
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.h[4]
+; CHECK-NEXT:    mov v0.b[4], w8
+; CHECK-NEXT:    umov w8, v1.h[5]
+; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.h[6]
+; CHECK-NEXT:    mov v0.b[6], w8
+; CHECK-NEXT:    umov w8, v1.h[7]
+; CHECK-NEXT:    mov v0.b[7], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
@@ -1811,7 +698,7 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
@@ -1826,30 +713,15 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
   ret <16 x i8> %res
 }
 
-define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    umull v4.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT:    umull2 v0.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT:    umull v5.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT:    umull2 v1.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v32i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %1 = zext <32 x i8> %op1 to <32 x i16>
@@ -1862,40 +734,56 @@ define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v64i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT:    umull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT:    umull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT:    umull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT:    umull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT:    umull2 v16.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT:    umull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    umull2 v3.8h, v4.16b, v5.16b
-; VBITS_EQ_128-NEXT:    umull v4.8h, v4.8b, v5.8b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v4.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: umulh_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    uunpklo z4.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z5.h, z1.b
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z6.h, z2.b
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z7.h, z3.b
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z2.h, z2.b
+; VBITS_GE_256-NEXT:    uunpklo z3.h, z3.b
+; VBITS_GE_256-NEXT:    mul z0.h, p1/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    movprfx z2, z5
+; VBITS_GE_256-NEXT:    mul z2.h, p1/m, z2.h, z7.h
+; VBITS_GE_256-NEXT:    mul z1.h, p1/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    mul z4.h, p1/m, z4.h, z6.h
+; VBITS_GE_256-NEXT:    lsr z0.h, p1/m, z0.h, #8
+; VBITS_GE_256-NEXT:    movprfx z3, z4
+; VBITS_GE_256-NEXT:    lsr z3.h, p1/m, z3.h, #8
+; VBITS_GE_256-NEXT:    lsr z1.h, p1/m, z1.h, #8
+; VBITS_GE_256-NEXT:    lsr z2.h, p1/m, z2.h, #8
+; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    splice z3.b, p1, z3.b, z0.b
+; VBITS_GE_256-NEXT:    splice z2.b, p1, z2.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: umulh_v64i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: umulh_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %1 = zext <64 x i8> %op1 to <64 x i16>
@@ -1907,64 +795,15 @@ define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v128i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT:    umull2 v6.8h, v0.16b, v2.16b
-; VBITS_EQ_128-NEXT:    umull v0.8h, v0.8b, v2.8b
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    umull2 v7.8h, v1.16b, v5.16b
-; VBITS_EQ_128-NEXT:    umull v1.8h, v1.8b, v5.8b
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v0.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v6.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v7.8h, #8
-; VBITS_EQ_128-NEXT:    umull2 v17.8h, v3.16b, v2.16b
-; VBITS_EQ_128-NEXT:    umull v2.8h, v3.8b, v2.8b
-; VBITS_EQ_128-NEXT:    ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    umull2 v19.8h, v4.16b, v16.16b
-; VBITS_EQ_128-NEXT:    umull v4.8h, v4.8b, v16.8b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v2.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT:    shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v4.16b, v19.8h, #8
-; VBITS_EQ_128-NEXT:    umull2 v21.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT:    umull v3.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT:    ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT:    umull2 v23.8h, v18.16b, v20.16b
-; VBITS_EQ_128-NEXT:    umull v18.8h, v18.8b, v20.8b
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v3.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT:    shrn v18.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn2 v18.16b, v23.8h, #8
-; VBITS_EQ_128-NEXT:    umull v20.8h, v16.8b, v5.8b
-; VBITS_EQ_128-NEXT:    umull2 v5.8h, v16.16b, v5.16b
-; VBITS_EQ_128-NEXT:    stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    umull v25.8h, v22.8b, v24.8b
-; VBITS_EQ_128-NEXT:    umull2 v16.8h, v22.16b, v24.16b
-; VBITS_EQ_128-NEXT:    shrn v20.8b, v20.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v22.8b, v25.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v20.16b, v5.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v22.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT:    stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %insert = insertelement <128 x i16> undef, i16 8, i64 0
@@ -1978,130 +817,15 @@ define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v256i8:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    sub sp, sp, #96
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT:    ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT:    ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    umull2 v0.8h, v1.16b, v3.16b
-; VBITS_EQ_128-NEXT:    umull v4.8h, v1.8b, v3.8b
-; VBITS_EQ_128-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT:    umull2 v0.8h, v2.16b, v6.16b
-; VBITS_EQ_128-NEXT:    shrn v4.8b, v4.8h, #8
-; VBITS_EQ_128-NEXT:    umull v6.8h, v2.8b, v6.8b
-; VBITS_EQ_128-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    umull2 v2.8h, v5.16b, v3.16b
-; VBITS_EQ_128-NEXT:    shrn v6.8b, v6.8h, #8
-; VBITS_EQ_128-NEXT:    umull v5.8h, v5.8b, v3.8b
-; VBITS_EQ_128-NEXT:    ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT:    umull2 v3.8h, v7.16b, v16.16b
-; VBITS_EQ_128-NEXT:    umull v7.8h, v7.8b, v16.8b
-; VBITS_EQ_128-NEXT:    shrn v5.8b, v5.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v5.16b, v2.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT:    shrn v7.8b, v7.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v7.16b, v3.8h, #8
-; VBITS_EQ_128-NEXT:    umull2 v31.8h, v19.16b, v16.16b
-; VBITS_EQ_128-NEXT:    umull v9.8h, v19.8b, v16.8b
-; VBITS_EQ_128-NEXT:    umull2 v21.8h, v18.16b, v17.16b
-; VBITS_EQ_128-NEXT:    umull v30.8h, v18.8b, v17.8b
-; VBITS_EQ_128-NEXT:    ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v9.8b, v9.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v30.8b, v30.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v9.16b, v31.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v30.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT:    umull2 v16.8h, v17.16b, v20.16b
-; VBITS_EQ_128-NEXT:    ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umull v18.8h, v17.8b, v20.8b
-; VBITS_EQ_128-NEXT:    ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT:    umull2 v17.8h, v22.16b, v19.16b
-; VBITS_EQ_128-NEXT:    shrn2 v4.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    umull v19.8h, v22.8b, v19.8b
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v18.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v16.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v19.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v17.8h, #8
-; VBITS_EQ_128-NEXT:    umull2 v12.8h, v24.16b, v22.16b
-; VBITS_EQ_128-NEXT:    umull v13.8h, v24.8b, v22.8b
-; VBITS_EQ_128-NEXT:    umull2 v10.8h, v20.16b, v23.16b
-; VBITS_EQ_128-NEXT:    ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umull v11.8h, v20.8b, v23.8b
-; VBITS_EQ_128-NEXT:    ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT:    shrn2 v6.16b, v21.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT:    umull2 v22.8h, v26.16b, v24.16b
-; VBITS_EQ_128-NEXT:    umull v24.8h, v26.8b, v24.8b
-; VBITS_EQ_128-NEXT:    umull2 v20.8h, v23.16b, v25.16b
-; VBITS_EQ_128-NEXT:    umull v23.8h, v23.8b, v25.8b
-; VBITS_EQ_128-NEXT:    ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT:    umull2 v15.8h, v28.16b, v26.16b
-; VBITS_EQ_128-NEXT:    umull v1.8h, v28.8b, v26.8b
-; VBITS_EQ_128-NEXT:    umull2 v14.8h, v25.16b, v27.16b
-; VBITS_EQ_128-NEXT:    umull v8.8h, v25.8b, v27.8b
-; VBITS_EQ_128-NEXT:    ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT:    shrn v8.8b, v8.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v8.16b, v14.8h, #8
-; VBITS_EQ_128-NEXT:    ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v2.8b, v23.8h, #8
-; VBITS_EQ_128-NEXT:    stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT:    shrn v3.8b, v24.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v2.16b, v20.8h, #8
-; VBITS_EQ_128-NEXT:    stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    umull2 v26.8h, v0.16b, v28.16b
-; VBITS_EQ_128-NEXT:    shrn2 v3.16b, v22.8h, #8
-; VBITS_EQ_128-NEXT:    umull v28.8h, v0.8b, v28.8b
-; VBITS_EQ_128-NEXT:    stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT:    umull2 v25.8h, v27.16b, v29.16b
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT:    umull v27.8h, v27.8b, v29.8b
-; VBITS_EQ_128-NEXT:    shrn v29.8b, v1.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v13.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v11.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v29.16b, v15.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v12.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v10.8h, #8
-; VBITS_EQ_128-NEXT:    stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn v0.8b, v27.8h, #8
-; VBITS_EQ_128-NEXT:    shrn v1.8b, v28.8h, #8
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    shrn2 v0.16b, v25.8h, #8
-; VBITS_EQ_128-NEXT:    shrn2 v1.16b, v26.8h, #8
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add sp, sp, #96
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %1 = zext <256 x i8> %op1 to <256 x i16>
@@ -2115,26 +839,20 @@ define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 
 ; Don't use SVE for 64-bit vectors.
 ; FIXME: The codegen for the >=256 bits case can be improved.
-define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v4i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    umull v0.4s, v0.4h, v1.4h
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v4i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    umull v0.4s, v0.4h, v1.4h
-; VBITS_GE_256-NEXT:    ushr v1.4s, v0.4s, #16
-; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
-; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
-; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
-; VBITS_GE_256-NEXT:    mov v0.h[1], w8
-; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
-; VBITS_GE_256-NEXT:    mov v0.h[2], w9
-; VBITS_GE_256-NEXT:    mov v0.h[3], w8
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; VBITS_GE_256-NEXT:    ret
+define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    ushr v1.4s, v0.4s, #16
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -2144,7 +862,7 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
@@ -2159,30 +877,15 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
   ret <8 x i16> %res
 }
 
-define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v16i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    umull v4.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT:    umull2 v0.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT:    umull v5.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT:    umull2 v1.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    stp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v16i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %1 = zext <16 x i16> %op1 to <16 x i32>
@@ -2195,40 +898,47 @@ define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #32]
-; VBITS_EQ_128-NEXT:    umull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT:    umull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT:    umull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT:    umull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1]
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT:    umull2 v16.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT:    umull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    umull2 v3.4s, v4.8h, v5.8h
-; VBITS_EQ_128-NEXT:    umull v4.4s, v4.4h, v5.4h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v4.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: umulh_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z7.d, z1.d
+; VBITS_GE_256-NEXT:    mov z16.d, z3.d
+; VBITS_GE_256-NEXT:    ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT:    umull2 v4.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT:    umull v5.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    umull2 v6.4s, v1.8h, v3.8h
+; VBITS_GE_256-NEXT:    umull v1.4s, v1.4h, v3.4h
+; VBITS_GE_256-NEXT:    umull2 v3.4s, v0.8h, v2.8h
+; VBITS_GE_256-NEXT:    umull v0.4s, v0.4h, v2.4h
+; VBITS_GE_256-NEXT:    umull2 v2.4s, v7.8h, v16.8h
+; VBITS_GE_256-NEXT:    umull v7.4s, v7.4h, v16.4h
+; VBITS_GE_256-NEXT:    uzp2 v4.8h, v5.8h, v4.8h
+; VBITS_GE_256-NEXT:    uzp2 v1.8h, v1.8h, v6.8h
+; VBITS_GE_256-NEXT:    uzp2 v0.8h, v0.8h, v3.8h
+; VBITS_GE_256-NEXT:    uzp2 v2.8h, v7.8h, v2.8h
+; VBITS_GE_256-NEXT:    splice z4.h, p1, z4.h, z0.h
+; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z2.h
+; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: umulh_v32i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: umulh_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %1 = zext <32 x i16> %op1 to <32 x i32>
@@ -2240,63 +950,15 @@ define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v64i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldp q2, q5, [x1, #96]
-; VBITS_EQ_128-NEXT:    umull2 v6.4s, v0.8h, v2.8h
-; VBITS_EQ_128-NEXT:    umull v0.4s, v0.4h, v2.4h
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    umull2 v7.4s, v1.8h, v5.8h
-; VBITS_EQ_128-NEXT:    umull v1.4s, v1.4h, v5.4h
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v0.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v6.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q2, q16, [x1, #64]
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v7.4s, #16
-; VBITS_EQ_128-NEXT:    umull2 v17.4s, v3.8h, v2.8h
-; VBITS_EQ_128-NEXT:    umull v2.4s, v3.4h, v2.4h
-; VBITS_EQ_128-NEXT:    ldp q5, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    umull2 v19.4s, v4.8h, v16.8h
-; VBITS_EQ_128-NEXT:    umull v4.4s, v4.4h, v16.4h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v2.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q3, q20, [x1, #32]
-; VBITS_EQ_128-NEXT:    shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v4.8h, v19.4s, #16
-; VBITS_EQ_128-NEXT:    umull2 v21.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT:    umull v3.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT:    ldp q16, q22, [x0]
-; VBITS_EQ_128-NEXT:    umull2 v23.4s, v18.8h, v20.8h
-; VBITS_EQ_128-NEXT:    umull v18.4s, v18.4h, v20.4h
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v3.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q5, q24, [x1]
-; VBITS_EQ_128-NEXT:    shrn v18.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT:    stp q2, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn2 v18.8h, v23.4s, #16
-; VBITS_EQ_128-NEXT:    umull v20.4s, v16.4h, v5.4h
-; VBITS_EQ_128-NEXT:    umull2 v5.4s, v16.8h, v5.8h
-; VBITS_EQ_128-NEXT:    stp q3, q18, [x0, #32]
-; VBITS_EQ_128-NEXT:    umull v25.4s, v22.4h, v24.4h
-; VBITS_EQ_128-NEXT:    umull2 v16.4s, v22.8h, v24.8h
-; VBITS_EQ_128-NEXT:    shrn v20.4h, v20.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v22.4h, v25.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v20.8h, v5.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v22.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT:    stp q20, q22, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %1 = zext <64 x i16> %op1 to <64 x i32>
@@ -2308,130 +970,15 @@ define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v128i16:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    sub sp, sp, #96
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 96
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -64
-; VBITS_EQ_128-NEXT:    ldp q2, q1, [x0, #224]
-; VBITS_EQ_128-NEXT:    ldp q6, q3, [x1, #224]
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    umull2 v0.4s, v1.8h, v3.8h
-; VBITS_EQ_128-NEXT:    umull v4.4s, v1.4h, v3.4h
-; VBITS_EQ_128-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q16, q3, [x1, #192]
-; VBITS_EQ_128-NEXT:    umull2 v0.4s, v2.8h, v6.8h
-; VBITS_EQ_128-NEXT:    shrn v4.4h, v4.4s, #16
-; VBITS_EQ_128-NEXT:    umull v6.4s, v2.4h, v6.4h
-; VBITS_EQ_128-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    umull2 v2.4s, v5.8h, v3.8h
-; VBITS_EQ_128-NEXT:    shrn v6.4h, v6.4s, #16
-; VBITS_EQ_128-NEXT:    umull v5.4s, v5.4h, v3.4h
-; VBITS_EQ_128-NEXT:    ldp q19, q18, [x0, #160]
-; VBITS_EQ_128-NEXT:    umull2 v3.4s, v7.8h, v16.8h
-; VBITS_EQ_128-NEXT:    umull v7.4s, v7.4h, v16.4h
-; VBITS_EQ_128-NEXT:    shrn v5.4h, v5.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v5.8h, v2.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1, #160]
-; VBITS_EQ_128-NEXT:    shrn v7.4h, v7.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v7.8h, v3.4s, #16
-; VBITS_EQ_128-NEXT:    umull2 v31.4s, v19.8h, v16.8h
-; VBITS_EQ_128-NEXT:    umull v9.4s, v19.4h, v16.4h
-; VBITS_EQ_128-NEXT:    umull2 v21.4s, v18.8h, v17.8h
-; VBITS_EQ_128-NEXT:    umull v30.4s, v18.4h, v17.4h
-; VBITS_EQ_128-NEXT:    ldp q22, q17, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v9.4h, v9.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v30.4h, v30.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v9.8h, v31.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v30.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q19, q20, [x1, #128]
-; VBITS_EQ_128-NEXT:    umull2 v16.4s, v17.8h, v20.8h
-; VBITS_EQ_128-NEXT:    ldr q21, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umull v18.4s, v17.4h, v20.4h
-; VBITS_EQ_128-NEXT:    ldp q24, q20, [x0, #96]
-; VBITS_EQ_128-NEXT:    umull2 v17.4s, v22.8h, v19.8h
-; VBITS_EQ_128-NEXT:    shrn2 v4.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    umull v19.4s, v22.4h, v19.4h
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v18.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v16.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q22, q23, [x1, #96]
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v19.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v17.4s, #16
-; VBITS_EQ_128-NEXT:    umull2 v12.4s, v24.8h, v22.8h
-; VBITS_EQ_128-NEXT:    umull v13.4s, v24.4h, v22.4h
-; VBITS_EQ_128-NEXT:    umull2 v10.4s, v20.8h, v23.8h
-; VBITS_EQ_128-NEXT:    ldr q21, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umull v11.4s, v20.4h, v23.4h
-; VBITS_EQ_128-NEXT:    ldp q26, q23, [x0, #64]
-; VBITS_EQ_128-NEXT:    shrn2 v6.8h, v21.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q24, q25, [x1, #64]
-; VBITS_EQ_128-NEXT:    umull2 v22.4s, v26.8h, v24.8h
-; VBITS_EQ_128-NEXT:    umull v24.4s, v26.4h, v24.4h
-; VBITS_EQ_128-NEXT:    umull2 v20.4s, v23.8h, v25.8h
-; VBITS_EQ_128-NEXT:    umull v23.4s, v23.4h, v25.4h
-; VBITS_EQ_128-NEXT:    ldp q28, q25, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp q26, q27, [x1, #32]
-; VBITS_EQ_128-NEXT:    umull2 v15.4s, v28.8h, v26.8h
-; VBITS_EQ_128-NEXT:    umull v1.4s, v28.4h, v26.4h
-; VBITS_EQ_128-NEXT:    umull2 v14.4s, v25.8h, v27.8h
-; VBITS_EQ_128-NEXT:    umull v8.4s, v25.4h, v27.4h
-; VBITS_EQ_128-NEXT:    ldp q0, q27, [x0]
-; VBITS_EQ_128-NEXT:    shrn v8.4h, v8.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v8.8h, v14.4s, #16
-; VBITS_EQ_128-NEXT:    ldp q28, q29, [x1]
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #128]
-; VBITS_EQ_128-NEXT:    shrn v2.4h, v23.4s, #16
-; VBITS_EQ_128-NEXT:    stp q9, q30, [x0, #160]
-; VBITS_EQ_128-NEXT:    shrn v3.4h, v24.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v2.8h, v20.4s, #16
-; VBITS_EQ_128-NEXT:    stp q7, q5, [x0, #192]
-; VBITS_EQ_128-NEXT:    umull2 v26.4s, v0.8h, v28.8h
-; VBITS_EQ_128-NEXT:    shrn2 v3.8h, v22.4s, #16
-; VBITS_EQ_128-NEXT:    umull v28.4s, v0.4h, v28.4h
-; VBITS_EQ_128-NEXT:    stp q6, q4, [x0, #224]
-; VBITS_EQ_128-NEXT:    umull2 v25.4s, v27.8h, v29.8h
-; VBITS_EQ_128-NEXT:    stp q3, q2, [x0, #64]
-; VBITS_EQ_128-NEXT:    umull v27.4s, v27.4h, v29.4h
-; VBITS_EQ_128-NEXT:    shrn v29.4h, v1.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v13.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v11.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v29.8h, v15.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v12.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v10.4s, #16
-; VBITS_EQ_128-NEXT:    stp q29, q8, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn v0.4h, v27.4s, #16
-; VBITS_EQ_128-NEXT:    shrn v1.4h, v28.4s, #16
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    shrn2 v0.8h, v25.4s, #16
-; VBITS_EQ_128-NEXT:    shrn2 v1.8h, v26.4s, #16
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add sp, sp, #96
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %1 = zext <128 x i16> %op1 to <128 x i32>
@@ -2444,7 +991,7 @@ define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
@@ -2453,8 +1000,6 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT:    ret
-
-
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -2464,7 +1009,7 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
@@ -2479,39 +1024,15 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
   ret <4 x i32> %res
 }
 
-define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v8i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    ushll v5.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    ushll v4.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v7.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v6.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z7.d
-; VBITS_EQ_128-NEXT:    ushll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z6.d
-; VBITS_EQ_128-NEXT:    shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v2.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_128-NEXT:    shrn2 v5.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v2.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    stp q5, q2, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v8i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -2526,57 +1047,47 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v16i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q1, q2, [x0, #32]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    ushll v19.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q3, q4, [x0]
-; VBITS_EQ_128-NEXT:    ushll v18.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v7.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q5, q6, [x1, #32]
-; VBITS_EQ_128-NEXT:    ushll v0.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v4.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v21.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v5.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT:    ushll2 v22.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z19.d
-; VBITS_EQ_128-NEXT:    ushll v6.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z21.d
-; VBITS_EQ_128-NEXT:    shrn v5.2s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    mul z2.d, p0/m, z2.d, z22.d
-; VBITS_EQ_128-NEXT:    ushll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    mul z6.d, p0/m, z6.d, z18.d
-; VBITS_EQ_128-NEXT:    ushll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v20.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    mul z7.d, p0/m, z7.d, z19.d
-; VBITS_EQ_128-NEXT:    ushll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z20.d
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v7.2s, v7.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z17.d
-; VBITS_EQ_128-NEXT:    shrn2 v5.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v2.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v7.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    stp q5, q6, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q7, q0, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: umulh_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z7.d, z1.d
+; VBITS_GE_256-NEXT:    mov z16.d, z3.d
+; VBITS_GE_256-NEXT:    ext z7.b, z7.b, z7.b, #16
+; VBITS_GE_256-NEXT:    umull2 v4.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z3.b, #16
+; VBITS_GE_256-NEXT:    umull v5.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    umull2 v6.2d, v1.4s, v3.4s
+; VBITS_GE_256-NEXT:    umull v1.2d, v1.2s, v3.2s
+; VBITS_GE_256-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; VBITS_GE_256-NEXT:    umull v0.2d, v0.2s, v2.2s
+; VBITS_GE_256-NEXT:    umull2 v2.2d, v7.4s, v16.4s
+; VBITS_GE_256-NEXT:    umull v7.2d, v7.2s, v16.2s
+; VBITS_GE_256-NEXT:    uzp2 v4.4s, v5.4s, v4.4s
+; VBITS_GE_256-NEXT:    uzp2 v1.4s, v1.4s, v6.4s
+; VBITS_GE_256-NEXT:    uzp2 v0.4s, v0.4s, v3.4s
+; VBITS_GE_256-NEXT:    uzp2 v2.4s, v7.4s, v2.4s
+; VBITS_GE_256-NEXT:    splice z4.s, p1, z4.s, z0.s
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z2.s
+; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: umulh_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: umulh_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %1 = zext <16 x i32> %op1 to <16 x i64>
@@ -2588,104 +1099,15 @@ define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -32
-; VBITS_EQ_128-NEXT:    ldp q17, q16, [x0, #64]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    ushll v27.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v29.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q23, q28, [x0, #96]
-; VBITS_EQ_128-NEXT:    ushll v19.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v22.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v31.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v8.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q26, q25, [x1, #96]
-; VBITS_EQ_128-NEXT:    ushll v30.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v28.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v9.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v26.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q24, q21, [x1, #64]
-; VBITS_EQ_128-NEXT:    mul z26.d, p0/m, z26.d, z31.d
-; VBITS_EQ_128-NEXT:    mul z8.d, p0/m, z8.d, z9.d
-; VBITS_EQ_128-NEXT:    ushll2 v10.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v25.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v31.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT:    mul z28.d, p0/m, z28.d, z10.d
-; VBITS_EQ_128-NEXT:    ushll v24.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT:    mul z25.d, p0/m, z25.d, z30.d
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    mul z24.d, p0/m, z24.d, z27.d
-; VBITS_EQ_128-NEXT:    mul z29.d, p0/m, z29.d, z31.d
-; VBITS_EQ_128-NEXT:    ushll2 v30.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v21.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT:    ushll v6.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT:    mul z22.d, p0/m, z22.d, z30.d
-; VBITS_EQ_128-NEXT:    mul z19.d, p0/m, z19.d, z21.d
-; VBITS_EQ_128-NEXT:    ldp q20, q18, [x1, #32]
-; VBITS_EQ_128-NEXT:    ushll v4.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT:    shrn v19.2s, v19.2d, #32
-; VBITS_EQ_128-NEXT:    ushll2 v5.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v7.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v27.2d, v20.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v20.2d, v20.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q3, q1, [x0]
-; VBITS_EQ_128-NEXT:    mul z6.d, p0/m, z6.d, z20.d
-; VBITS_EQ_128-NEXT:    mul z7.d, p0/m, z7.d, z27.d
-; VBITS_EQ_128-NEXT:    ushll2 v21.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    ushll v18.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT:    ushll v2.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z21.d
-; VBITS_EQ_128-NEXT:    ushll2 v3.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z18.d
-; VBITS_EQ_128-NEXT:    ldp q16, q17, [x1]
-; VBITS_EQ_128-NEXT:    ushll v0.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    shrn v4.2s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    ushll2 v1.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    shrn v18.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT:    ushll v20.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT:    ushll2 v16.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v23.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    mul z2.d, p0/m, z2.d, z20.d
-; VBITS_EQ_128-NEXT:    ushll2 v17.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    mul z3.d, p0/m, z3.d, z16.d
-; VBITS_EQ_128-NEXT:    shrn v16.2s, v26.2d, #32
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z23.d
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v2.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v17.2s, v25.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v16.4s, v8.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v18.4s, v29.2d, #32
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    shrn2 v17.4s, v28.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v4.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v2.4s, v3.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    stp q18, q19, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q6, q4, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q2, q0, [x0]
-; VBITS_EQ_128-NEXT:    stp q16, q17, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %1 = zext <32 x i32> %op1 to <32 x i64>
@@ -2697,276 +1119,15 @@ define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v64i32:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 80
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset w29, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -64
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -72
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -80
-; VBITS_EQ_128-NEXT:    addvl sp, sp, #-12
-; VBITS_EQ_128-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG
-; VBITS_EQ_128-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG
-; VBITS_EQ_128-NEXT:    ldp q4, q5, [x0, #96]
-; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    stp q5, q4, [sp, #-80]! // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q0, q2, [x0, #48]
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ldr q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldr q3, [x0, #80]
-; VBITS_EQ_128-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    stp q3, q2, [sp, #32] // 32-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    str z1, [x8, #11, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    str z0, [x8, #10, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #9, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q23, q26, [x0, #128]
-; VBITS_EQ_128-NEXT:    str z0, [x8, #8, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #7, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q25, q24, [x0, #160]
-; VBITS_EQ_128-NEXT:    str z0, [x8, #6, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v23.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v1.2d, v26.4s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #5, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v27.2d, v25.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q30, q0, [x0, #192]
-; VBITS_EQ_128-NEXT:    str z1, [x8, #4, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v9.2d, v24.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v12.2d, v30.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q31, q1, [x0, #224]
-; VBITS_EQ_128-NEXT:    ushll v11.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v8.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v10.2d, v31.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v15.2d, v31.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q29, q28, [x1, #224]
-; VBITS_EQ_128-NEXT:    ushll2 v18.2d, v1.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v31.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v2.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q14, q0, [x1, #192]
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v28.2s, #0
-; VBITS_EQ_128-NEXT:    ushll v20.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v19.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v28.4s, #0
-; VBITS_EQ_128-NEXT:    mul z11.d, p0/m, z11.d, z20.d
-; VBITS_EQ_128-NEXT:    ldp q21, q22, [x0]
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z18.d
-; VBITS_EQ_128-NEXT:    ushll v18.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT:    ushll v20.2d, v14.2s, #0
-; VBITS_EQ_128-NEXT:    ldp q4, q13, [x1, #160]
-; VBITS_EQ_128-NEXT:    ldp q5, q6, [x1, #128]
-; VBITS_EQ_128-NEXT:    ldp q7, q3, [x1, #96]
-; VBITS_EQ_128-NEXT:    str z0, [x8, #3, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ldp q17, q16, [x1, #64]
-; VBITS_EQ_128-NEXT:    movprfx z0, z31
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_EQ_128-NEXT:    str z0, [x8, #1, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z0, z15
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v30.2s, #0
-; VBITS_EQ_128-NEXT:    str z0, [x8, #2, mul vl] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ldp q2, q29, [x1, #32]
-; VBITS_EQ_128-NEXT:    movprfx z15, z10
-; VBITS_EQ_128-NEXT:    mul z15.d, p0/m, z15.d, z18.d
-; VBITS_EQ_128-NEXT:    movprfx z0, z8
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z19.d
-; VBITS_EQ_128-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v14.4s, #0
-; VBITS_EQ_128-NEXT:    ldp q19, q18, [x1]
-; VBITS_EQ_128-NEXT:    movprfx z10, z12
-; VBITS_EQ_128-NEXT:    mul z10.d, p0/m, z10.d, z0.d
-; VBITS_EQ_128-NEXT:    movprfx z8, z1
-; VBITS_EQ_128-NEXT:    mul z8.d, p0/m, z8.d, z20.d
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v13.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v12.2d, v24.2s, #0
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v13.2s, #0
-; VBITS_EQ_128-NEXT:    mul z9.d, p0/m, z9.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v4.4s, #0
-; VBITS_EQ_128-NEXT:    mul z12.d, p0/m, z12.d, z1.d
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll v20.2d, v25.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z13, z20
-; VBITS_EQ_128-NEXT:    mul z13.d, p0/m, z13.d, z1.d
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v6.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v6.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z14, z6
-; VBITS_EQ_128-NEXT:    mul z14.d, p0/m, z14.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll v4.2d, v26.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z30, z4
-; VBITS_EQ_128-NEXT:    mul z30.d, p0/m, z30.d, z1.d
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v5.4s, #0
-; VBITS_EQ_128-NEXT:    ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v5.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z31, z4
-; VBITS_EQ_128-NEXT:    mul z31.d, p0/m, z31.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll v6.2d, v23.2s, #0
-; VBITS_EQ_128-NEXT:    ldr q4, [sp] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v3.4s, #0
-; VBITS_EQ_128-NEXT:    movprfx z28, z6
-; VBITS_EQ_128-NEXT:    mul z28.d, p0/m, z28.d, z1.d
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z23, z3
-; VBITS_EQ_128-NEXT:    mul z23.d, p0/m, z23.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll v5.2d, v4.2s, #0
-; VBITS_EQ_128-NEXT:    ldr q3, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    movprfx z20, z5
-; VBITS_EQ_128-NEXT:    mul z20.d, p0/m, z20.d, z1.d
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v7.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v4.2d, v7.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z7, z1
-; VBITS_EQ_128-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; VBITS_EQ_128-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ushll v3.2d, v3.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z6, z3
-; VBITS_EQ_128-NEXT:    mul z6.d, p0/m, z6.d, z4.d
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v16.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v5.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    movprfx z26, z1
-; VBITS_EQ_128-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; VBITS_EQ_128-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll v3.2d, v16.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z24, z5
-; VBITS_EQ_128-NEXT:    mul z24.d, p0/m, z24.d, z3.d
-; VBITS_EQ_128-NEXT:    ushll v16.2d, v1.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v17.4s, #0
-; VBITS_EQ_128-NEXT:    movprfx z25, z1
-; VBITS_EQ_128-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll v5.2d, v17.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v29.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v17.2d, v29.2s, #0
-; VBITS_EQ_128-NEXT:    movprfx z29, z16
-; VBITS_EQ_128-NEXT:    mul z29.d, p0/m, z29.d, z5.d
-; VBITS_EQ_128-NEXT:    ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    movprfx z4, z1
-; VBITS_EQ_128-NEXT:    mul z4.d, p0/m, z4.d, z0.d
-; VBITS_EQ_128-NEXT:    ushll v5.2d, v22.2s, #0
-; VBITS_EQ_128-NEXT:    ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    ushll2 v16.2d, v22.4s, #0
-; VBITS_EQ_128-NEXT:    movprfx z22, z0
-; VBITS_EQ_128-NEXT:    mul z22.d, p0/m, z22.d, z17.d
-; VBITS_EQ_128-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ushll v1.2d, v2.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v2.2d, v2.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v17.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v0.2d, v0.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v3.2d, v18.2s, #0
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z17.d
-; VBITS_EQ_128-NEXT:    ushll2 v18.2d, v18.4s, #0
-; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; VBITS_EQ_128-NEXT:    movprfx z2, z5
-; VBITS_EQ_128-NEXT:    mul z2.d, p0/m, z2.d, z3.d
-; VBITS_EQ_128-NEXT:    mul z18.d, p0/m, z18.d, z16.d
-; VBITS_EQ_128-NEXT:    ushll2 v5.2d, v21.4s, #0
-; VBITS_EQ_128-NEXT:    ushll2 v16.2d, v19.4s, #0
-; VBITS_EQ_128-NEXT:    ushll v17.2d, v19.2s, #0
-; VBITS_EQ_128-NEXT:    mul z5.d, p0/m, z5.d, z16.d
-; VBITS_EQ_128-NEXT:    shrn v16.2s, v1.2d, #32
-; VBITS_EQ_128-NEXT:    ushll v3.2d, v21.2s, #0
-; VBITS_EQ_128-NEXT:    shrn v21.2s, v22.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v16.4s, v0.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    shrn v1.2s, v20.2d, #32
-; VBITS_EQ_128-NEXT:    mul z17.d, p0/m, z17.d, z3.d
-; VBITS_EQ_128-NEXT:    shrn2 v21.4s, v4.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v7.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v3.2s, v13.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    shrn v4.2s, v12.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v6.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v7.2s, v15.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v1.4s, v23.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add x8, sp, #80
-; VBITS_EQ_128-NEXT:    shrn2 v3.4s, v27.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v4.4s, v9.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v19.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v19.2s, v11.2d, #32
-; VBITS_EQ_128-NEXT:    ldr z22, [x8] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q16, q21, [x0, #32]
-; VBITS_EQ_128-NEXT:    shrn2 v7.4s, v20.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v20.2s, v8.2d, #32
-; VBITS_EQ_128-NEXT:    stp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    shrn v0.2s, v2.2d, #32
-; VBITS_EQ_128-NEXT:    stp q3, q4, [x0, #160]
-; VBITS_EQ_128-NEXT:    shrn v3.2s, v24.2d, #32
-; VBITS_EQ_128-NEXT:    stp q7, q6, [x0, #224]
-; VBITS_EQ_128-NEXT:    shrn v6.2s, v30.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v7.2s, v28.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v4.2s, v29.2d, #32
-; VBITS_EQ_128-NEXT:    shrn v1.2s, v17.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v19.4s, v22.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v20.4s, v10.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v6.4s, v14.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v7.4s, v31.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v3.4s, v26.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v4.4s, v25.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v0.4s, v18.2d, #32
-; VBITS_EQ_128-NEXT:    shrn2 v1.4s, v5.2d, #32
-; VBITS_EQ_128-NEXT:    stp q7, q6, [x0, #128]
-; VBITS_EQ_128-NEXT:    stp q4, q3, [x0, #64]
-; VBITS_EQ_128-NEXT:    stp q1, q0, [x0]
-; VBITS_EQ_128-NEXT:    stp q20, q19, [x0, #192]
-; VBITS_EQ_128-NEXT:    addvl sp, sp, #12
-; VBITS_EQ_128-NEXT:    add sp, sp, #80
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %1 = zext <64 x i32> %op1 to <64 x i64>
@@ -2979,25 +1140,15 @@ define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v1i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    // kill: def $d1 killed $d1 def $q1
-; VBITS_EQ_128-NEXT:    // kill: def $d0 killed $d0 def $q0
-; VBITS_EQ_128-NEXT:    fmov x8, d0
-; VBITS_EQ_128-NEXT:    fmov x9, d1
-; VBITS_EQ_128-NEXT:    umulh x8, x8, x9
-; VBITS_EQ_128-NEXT:    fmov d0, x8
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v1i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl1
-; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
-; VBITS_GE_256-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT:    ret
+define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
   %mul = mul <1 x i128> %1, %2
@@ -3007,28 +1158,15 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 }
 
 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
-define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
-; VBITS_EQ_128-LABEL: umulh_v2i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    mov x8, v0.d[1]
-; VBITS_EQ_128-NEXT:    fmov x10, d0
-; VBITS_EQ_128-NEXT:    mov x9, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d1
-; VBITS_EQ_128-NEXT:    umulh x10, x10, x11
-; VBITS_EQ_128-NEXT:    umulh x8, x8, x9
-; VBITS_EQ_128-NEXT:    fmov d0, x10
-; VBITS_EQ_128-NEXT:    fmov d1, x8
-; VBITS_EQ_128-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v2i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl2
-; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_256-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; VBITS_GE_256-NEXT:    ret
+define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -3037,40 +1175,15 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
   ret <2 x i64> %res
 }
 
-define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v4i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0]
-; VBITS_EQ_128-NEXT:    mov x10, v0.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d0
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x1]
-; VBITS_EQ_128-NEXT:    mov x8, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov x9, d1
-; VBITS_EQ_128-NEXT:    mov x12, v2.d[1]
-; VBITS_EQ_128-NEXT:    fmov x13, d2
-; VBITS_EQ_128-NEXT:    mov x14, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x15, d3
-; VBITS_EQ_128-NEXT:    umulh x11, x11, x13
-; VBITS_EQ_128-NEXT:    umulh x10, x10, x12
-; VBITS_EQ_128-NEXT:    umulh x9, x9, x15
-; VBITS_EQ_128-NEXT:    umulh x8, x8, x14
-; VBITS_EQ_128-NEXT:    fmov d0, x11
-; VBITS_EQ_128-NEXT:    fmov d1, x10
-; VBITS_EQ_128-NEXT:    fmov d2, x9
-; VBITS_EQ_128-NEXT:    fmov d3, x8
-; VBITS_EQ_128-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT:    mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT:    stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_256-LABEL: umulh_v4i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
+define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: umulh_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %1 = zext <4 x i64> %op1 to <4 x i128>
@@ -3083,60 +1196,69 @@ define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v8i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #32]
-; VBITS_EQ_128-NEXT:    fmov x14, d0
-; VBITS_EQ_128-NEXT:    mov x13, v0.d[1]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov x12, d1
-; VBITS_EQ_128-NEXT:    mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_EQ_128-NEXT:    mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x9, d3
-; VBITS_EQ_128-NEXT:    fmov x17, d4
-; VBITS_EQ_128-NEXT:    mov x15, v4.d[1]
-; VBITS_EQ_128-NEXT:    ldp q3, q1, [x1]
-; VBITS_EQ_128-NEXT:    fmov x1, d5
-; VBITS_EQ_128-NEXT:    umulh x14, x14, x17
-; VBITS_EQ_128-NEXT:    mov x18, v5.d[1]
-; VBITS_EQ_128-NEXT:    umulh x13, x13, x15
-; VBITS_EQ_128-NEXT:    fmov x15, d2
-; VBITS_EQ_128-NEXT:    umulh x12, x12, x1
-; VBITS_EQ_128-NEXT:    mov x1, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x17, d1
-; VBITS_EQ_128-NEXT:    umulh x11, x11, x18
-; VBITS_EQ_128-NEXT:    mov x16, v1.d[1]
-; VBITS_EQ_128-NEXT:    fmov d2, x13
-; VBITS_EQ_128-NEXT:    fmov d5, x12
-; VBITS_EQ_128-NEXT:    umulh x9, x9, x17
-; VBITS_EQ_128-NEXT:    fmov x17, d3
-; VBITS_EQ_128-NEXT:    umulh x10, x10, x1
-; VBITS_EQ_128-NEXT:    fmov d3, x14
-; VBITS_EQ_128-NEXT:    umulh x8, x8, x16
-; VBITS_EQ_128-NEXT:    fmov d4, x11
-; VBITS_EQ_128-NEXT:    umulh x15, x15, x17
-; VBITS_EQ_128-NEXT:    fmov d1, x9
-; VBITS_EQ_128-NEXT:    fmov d6, x10
-; VBITS_EQ_128-NEXT:    fmov d0, x8
-; VBITS_EQ_128-NEXT:    fmov d7, x15
-; VBITS_EQ_128-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT:    mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT:    mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT:    stp q3, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q7, q1, [x0]
-; VBITS_EQ_128-NEXT:    ret
+; VBITS_GE_256-LABEL: umulh_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ptrue p1.d, vl2
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov x9, v0.d[1]
+; VBITS_GE_256-NEXT:    fmov x10, d0
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    fmov x17, d2
+; VBITS_GE_256-NEXT:    mov x13, v2.d[1]
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    mov x14, v0.d[1]
+; VBITS_GE_256-NEXT:    mov x18, v2.d[1]
+; VBITS_GE_256-NEXT:    umulh x10, x10, x17
+; VBITS_GE_256-NEXT:    mov x11, v1.d[1]
+; VBITS_GE_256-NEXT:    fmov x12, d1
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    mov x2, v3.d[1]
+; VBITS_GE_256-NEXT:    fmov x3, d3
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT:    umulh x9, x9, x13
+; VBITS_GE_256-NEXT:    mov x13, v1.d[1]
+; VBITS_GE_256-NEXT:    umulh x14, x14, x18
+; VBITS_GE_256-NEXT:    mov x18, v3.d[1]
+; VBITS_GE_256-NEXT:    umulh x12, x12, x3
+; VBITS_GE_256-NEXT:    fmov x15, d0
+; VBITS_GE_256-NEXT:    fmov x16, d1
+; VBITS_GE_256-NEXT:    fmov x1, d2
+; VBITS_GE_256-NEXT:    fmov x17, d3
+; VBITS_GE_256-NEXT:    fmov d0, x9
+; VBITS_GE_256-NEXT:    fmov d1, x10
+; VBITS_GE_256-NEXT:    umulh x9, x11, x2
+; VBITS_GE_256-NEXT:    umulh x15, x15, x1
+; VBITS_GE_256-NEXT:    fmov d4, x12
+; VBITS_GE_256-NEXT:    umulh x16, x16, x17
+; VBITS_GE_256-NEXT:    umulh x10, x13, x18
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    fmov d0, x14
+; VBITS_GE_256-NEXT:    fmov d2, x15
+; VBITS_GE_256-NEXT:    fmov d3, x9
+; VBITS_GE_256-NEXT:    fmov d6, x16
+; VBITS_GE_256-NEXT:    fmov d5, x10
+; VBITS_GE_256-NEXT:    mov v2.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    mov v4.d[1], v3.d[0]
+; VBITS_GE_256-NEXT:    mov v6.d[1], v5.d[0]
+; VBITS_GE_256-NEXT:    splice z1.d, p1, z1.d, z2.d
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    splice z4.d, p1, z4.d, z6.d
+; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; VBITS_GE_1024-LABEL: umulh_v8i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+; VBITS_GE_512-LABEL: umulh_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %1 = zext <8 x i64> %op1 to <8 x i128>
@@ -3148,111 +1270,15 @@ define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v16i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    str x21, [sp, #-32]! // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 32
-; VBITS_EQ_128-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset w21, -32
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_EQ_128-NEXT:    mov x10, v2.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d2
-; VBITS_EQ_128-NEXT:    ldp q4, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x9, d3
-; VBITS_EQ_128-NEXT:    mov x14, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x15, d4
-; VBITS_EQ_128-NEXT:    ldp q0, q1, [x0, #96]
-; VBITS_EQ_128-NEXT:    mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT:    fmov x13, d5
-; VBITS_EQ_128-NEXT:    fmov x5, d0
-; VBITS_EQ_128-NEXT:    mov x4, v0.d[1]
-; VBITS_EQ_128-NEXT:    ldp q2, q3, [x0, #64]
-; VBITS_EQ_128-NEXT:    mov x3, v1.d[1]
-; VBITS_EQ_128-NEXT:    mov x18, v2.d[1]
-; VBITS_EQ_128-NEXT:    fmov x2, d2
-; VBITS_EQ_128-NEXT:    ldp q5, q6, [x1, #96]
-; VBITS_EQ_128-NEXT:    mov x16, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x17, d3
-; VBITS_EQ_128-NEXT:    fmov x19, d5
-; VBITS_EQ_128-NEXT:    mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT:    ldp q4, q7, [x1, #64]
-; VBITS_EQ_128-NEXT:    mov x20, v6.d[1]
-; VBITS_EQ_128-NEXT:    fmov x21, d6
-; VBITS_EQ_128-NEXT:    umulh x5, x5, x19
-; VBITS_EQ_128-NEXT:    umulh x4, x4, x6
-; VBITS_EQ_128-NEXT:    mov x19, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x6, d4
-; VBITS_EQ_128-NEXT:    umulh x3, x3, x20
-; VBITS_EQ_128-NEXT:    ldp q3, q16, [x1, #32]
-; VBITS_EQ_128-NEXT:    fmov x20, d7
-; VBITS_EQ_128-NEXT:    umulh x2, x2, x6
-; VBITS_EQ_128-NEXT:    umulh x18, x18, x19
-; VBITS_EQ_128-NEXT:    fmov d18, x4
-; VBITS_EQ_128-NEXT:    fmov d19, x5
-; VBITS_EQ_128-NEXT:    fmov d20, x3
-; VBITS_EQ_128-NEXT:    umulh x17, x17, x20
-; VBITS_EQ_128-NEXT:    fmov x19, d3
-; VBITS_EQ_128-NEXT:    fmov d23, x2
-; VBITS_EQ_128-NEXT:    ldp q2, q17, [x1]
-; VBITS_EQ_128-NEXT:    fmov x1, d1
-; VBITS_EQ_128-NEXT:    fmov x20, d16
-; VBITS_EQ_128-NEXT:    umulh x15, x15, x19
-; VBITS_EQ_128-NEXT:    fmov d22, x18
-; VBITS_EQ_128-NEXT:    mov v19.d[1], v18.d[0]
-; VBITS_EQ_128-NEXT:    umulh x1, x1, x21
-; VBITS_EQ_128-NEXT:    mov x21, v7.d[1]
-; VBITS_EQ_128-NEXT:    umulh x13, x13, x20
-; VBITS_EQ_128-NEXT:    mov x7, v17.d[1]
-; VBITS_EQ_128-NEXT:    mov x6, v2.d[1]
-; VBITS_EQ_128-NEXT:    mov x20, v16.d[1]
-; VBITS_EQ_128-NEXT:    umulh x16, x16, x21
-; VBITS_EQ_128-NEXT:    fmov x21, d2
-; VBITS_EQ_128-NEXT:    fmov x19, d17
-; VBITS_EQ_128-NEXT:    umulh x8, x8, x7
-; VBITS_EQ_128-NEXT:    umulh x10, x10, x6
-; VBITS_EQ_128-NEXT:    fmov d5, x13
-; VBITS_EQ_128-NEXT:    umulh x11, x11, x21
-; VBITS_EQ_128-NEXT:    fmov d7, x15
-; VBITS_EQ_128-NEXT:    mov x21, v3.d[1]
-; VBITS_EQ_128-NEXT:    umulh x9, x9, x19
-; VBITS_EQ_128-NEXT:    umulh x12, x12, x20
-; VBITS_EQ_128-NEXT:    fmov d0, x8
-; VBITS_EQ_128-NEXT:    fmov d2, x10
-; VBITS_EQ_128-NEXT:    fmov d16, x16
-; VBITS_EQ_128-NEXT:    fmov d3, x11
-; VBITS_EQ_128-NEXT:    fmov d17, x17
-; VBITS_EQ_128-NEXT:    umulh x14, x14, x21
-; VBITS_EQ_128-NEXT:    fmov d1, x9
-; VBITS_EQ_128-NEXT:    fmov d4, x12
-; VBITS_EQ_128-NEXT:    fmov d21, x1
-; VBITS_EQ_128-NEXT:    mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT:    mov v17.d[1], v16.d[0]
-; VBITS_EQ_128-NEXT:    fmov d6, x14
-; VBITS_EQ_128-NEXT:    mov v21.d[1], v20.d[0]
-; VBITS_EQ_128-NEXT:    mov v5.d[1], v4.d[0]
-; VBITS_EQ_128-NEXT:    mov v7.d[1], v6.d[0]
-; VBITS_EQ_128-NEXT:    stp q23, q17, [x0, #64]
-; VBITS_EQ_128-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_EQ_128-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_EQ_128-NEXT:    stp q19, q21, [x0, #96]
-; VBITS_EQ_128-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q7, q5, [x0, #32]
-; VBITS_EQ_128-NEXT:    stp q3, q1, [x0]
-; VBITS_EQ_128-NEXT:    ldr x21, [sp], #32 // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: umulh_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %1 = zext <16 x i64> %op1 to <16 x i128>
@@ -3264,237 +1290,15 @@ define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
-; VBITS_EQ_128-LABEL: umulh_v32i64:
-; VBITS_EQ_128:       // %bb.0:
-; VBITS_EQ_128-NEXT:    sub sp, sp, #224
-; VBITS_EQ_128-NEXT:    .cfi_def_cfa_offset 224
-; VBITS_EQ_128-NEXT:    stp d15, d14, [sp, #64] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d13, d12, [sp, #80] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d11, d10, [sp, #96] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x29, x30, [sp, #128] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x28, x27, [sp, #144] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x26, x25, [sp, #160] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x24, x23, [sp, #176] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x22, x21, [sp, #192] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    stp x20, x19, [sp, #208] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    .cfi_offset w19, -8
-; VBITS_EQ_128-NEXT:    .cfi_offset w20, -16
-; VBITS_EQ_128-NEXT:    .cfi_offset w21, -24
-; VBITS_EQ_128-NEXT:    .cfi_offset w22, -32
-; VBITS_EQ_128-NEXT:    .cfi_offset w23, -40
-; VBITS_EQ_128-NEXT:    .cfi_offset w24, -48
-; VBITS_EQ_128-NEXT:    .cfi_offset w25, -56
-; VBITS_EQ_128-NEXT:    .cfi_offset w26, -64
-; VBITS_EQ_128-NEXT:    .cfi_offset w27, -72
-; VBITS_EQ_128-NEXT:    .cfi_offset w28, -80
-; VBITS_EQ_128-NEXT:    .cfi_offset w30, -88
-; VBITS_EQ_128-NEXT:    .cfi_offset w29, -96
-; VBITS_EQ_128-NEXT:    .cfi_offset b8, -104
-; VBITS_EQ_128-NEXT:    .cfi_offset b9, -112
-; VBITS_EQ_128-NEXT:    .cfi_offset b10, -120
-; VBITS_EQ_128-NEXT:    .cfi_offset b11, -128
-; VBITS_EQ_128-NEXT:    .cfi_offset b12, -136
-; VBITS_EQ_128-NEXT:    .cfi_offset b13, -144
-; VBITS_EQ_128-NEXT:    .cfi_offset b14, -152
-; VBITS_EQ_128-NEXT:    .cfi_offset b15, -160
-; VBITS_EQ_128-NEXT:    ldp q3, q2, [x0]
-; VBITS_EQ_128-NEXT:    mov x8, v3.d[1]
-; VBITS_EQ_128-NEXT:    ldp q5, q4, [x0, #64]
-; VBITS_EQ_128-NEXT:    fmov x2, d2
-; VBITS_EQ_128-NEXT:    str x8, [sp, #16] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    fmov x8, d3
-; VBITS_EQ_128-NEXT:    mov x6, v5.d[1]
-; VBITS_EQ_128-NEXT:    fmov x7, d5
-; VBITS_EQ_128-NEXT:    str x8, [sp] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q6, q3, [x0, #96]
-; VBITS_EQ_128-NEXT:    mov x20, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x21, d4
-; VBITS_EQ_128-NEXT:    mov x23, v6.d[1]
-; VBITS_EQ_128-NEXT:    fmov x24, d6
-; VBITS_EQ_128-NEXT:    ldp q16, q4, [x0, #128]
-; VBITS_EQ_128-NEXT:    mov x26, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x27, d3
-; VBITS_EQ_128-NEXT:    mov x28, v16.d[1]
-; VBITS_EQ_128-NEXT:    fmov x25, d16
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x0, #224]
-; VBITS_EQ_128-NEXT:    mov x22, v4.d[1]
-; VBITS_EQ_128-NEXT:    fmov x19, d4
-; VBITS_EQ_128-NEXT:    mov x13, v7.d[1]
-; VBITS_EQ_128-NEXT:    fmov x11, d7
-; VBITS_EQ_128-NEXT:    ldp q17, q6, [x0, #192]
-; VBITS_EQ_128-NEXT:    mov x12, v5.d[1]
-; VBITS_EQ_128-NEXT:    fmov x10, d5
-; VBITS_EQ_128-NEXT:    mov x17, v17.d[1]
-; VBITS_EQ_128-NEXT:    fmov x16, d17
-; VBITS_EQ_128-NEXT:    ldp q18, q3, [x0, #160]
-; VBITS_EQ_128-NEXT:    mov x15, v6.d[1]
-; VBITS_EQ_128-NEXT:    fmov x14, d6
-; VBITS_EQ_128-NEXT:    mov x5, v18.d[1]
-; VBITS_EQ_128-NEXT:    fmov x4, d18
-; VBITS_EQ_128-NEXT:    ldp q19, q16, [x1, #224]
-; VBITS_EQ_128-NEXT:    mov x29, v3.d[1]
-; VBITS_EQ_128-NEXT:    fmov x18, d3
-; VBITS_EQ_128-NEXT:    fmov x8, d19
-; VBITS_EQ_128-NEXT:    mov x9, v19.d[1]
-; VBITS_EQ_128-NEXT:    ldp q21, q20, [x1, #192]
-; VBITS_EQ_128-NEXT:    mov x30, v16.d[1]
-; VBITS_EQ_128-NEXT:    umulh x8, x11, x8
-; VBITS_EQ_128-NEXT:    umulh x11, x13, x9
-; VBITS_EQ_128-NEXT:    fmov x9, d21
-; VBITS_EQ_128-NEXT:    str x8, [sp, #48] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    ldp q22, q18, [x1, #160]
-; VBITS_EQ_128-NEXT:    ldp q24, q23, [x1, #128]
-; VBITS_EQ_128-NEXT:    ldp q25, q17, [x1, #96]
-; VBITS_EQ_128-NEXT:    ldp q26, q6, [x1, #64]
-; VBITS_EQ_128-NEXT:    ldp q4, q3, [x1, #32]
-; VBITS_EQ_128-NEXT:    ldp q7, q5, [x1]
-; VBITS_EQ_128-NEXT:    fmov x1, d16
-; VBITS_EQ_128-NEXT:    umulh x10, x10, x1
-; VBITS_EQ_128-NEXT:    mov x1, v20.d[1]
-; VBITS_EQ_128-NEXT:    ldp q1, q0, [x0, #32]
-; VBITS_EQ_128-NEXT:    str x10, [sp, #56] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    umulh x10, x12, x30
-; VBITS_EQ_128-NEXT:    mov x30, v21.d[1]
-; VBITS_EQ_128-NEXT:    fmov x3, d1
-; VBITS_EQ_128-NEXT:    str x10, [sp, #24] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    fmov x10, d20
-; VBITS_EQ_128-NEXT:    ldr x13, [sp, #16] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d13, d11, [sp, #48] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umulh x8, x14, x10
-; VBITS_EQ_128-NEXT:    umulh x10, x15, x1
-; VBITS_EQ_128-NEXT:    fmov x15, d18
-; VBITS_EQ_128-NEXT:    umulh x14, x16, x9
-; VBITS_EQ_128-NEXT:    mov x9, v22.d[1]
-; VBITS_EQ_128-NEXT:    umulh x16, x17, x30
-; VBITS_EQ_128-NEXT:    stp x11, x8, [sp, #32] // 16-byte Folded Spill
-; VBITS_EQ_128-NEXT:    fmov x17, d22
-; VBITS_EQ_128-NEXT:    mov x8, v18.d[1]
-; VBITS_EQ_128-NEXT:    umulh x18, x18, x15
-; VBITS_EQ_128-NEXT:    mov x15, v23.d[1]
-; VBITS_EQ_128-NEXT:    str x10, [sp, #8] // 8-byte Folded Spill
-; VBITS_EQ_128-NEXT:    umulh x4, x4, x17
-; VBITS_EQ_128-NEXT:    fmov d8, x16
-; VBITS_EQ_128-NEXT:    mov x17, v24.d[1]
-; VBITS_EQ_128-NEXT:    umulh x5, x5, x9
-; VBITS_EQ_128-NEXT:    umulh x1, x29, x8
-; VBITS_EQ_128-NEXT:    fmov x8, d23
-; VBITS_EQ_128-NEXT:    fmov x9, d24
-; VBITS_EQ_128-NEXT:    umulh x22, x22, x15
-; VBITS_EQ_128-NEXT:    fmov x15, d17
-; VBITS_EQ_128-NEXT:    fmov d9, x14
-; VBITS_EQ_128-NEXT:    umulh x19, x19, x8
-; VBITS_EQ_128-NEXT:    ldr d14, [sp, #8] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    mov x8, v17.d[1]
-; VBITS_EQ_128-NEXT:    umulh x25, x25, x9
-; VBITS_EQ_128-NEXT:    mov x9, v25.d[1]
-; VBITS_EQ_128-NEXT:    umulh x28, x28, x17
-; VBITS_EQ_128-NEXT:    fmov x17, d25
-; VBITS_EQ_128-NEXT:    umulh x15, x27, x15
-; VBITS_EQ_128-NEXT:    mov x27, v6.d[1]
-; VBITS_EQ_128-NEXT:    ldr d15, [sp, #40] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umulh x12, x26, x8
-; VBITS_EQ_128-NEXT:    fmov x26, d6
-; VBITS_EQ_128-NEXT:    umulh x17, x24, x17
-; VBITS_EQ_128-NEXT:    ldr x8, [sp] // 8-byte Folded Reload
-; VBITS_EQ_128-NEXT:    mov x24, v26.d[1]
-; VBITS_EQ_128-NEXT:    umulh x11, x23, x9
-; VBITS_EQ_128-NEXT:    fmov x23, d26
-; VBITS_EQ_128-NEXT:    umulh x21, x21, x26
-; VBITS_EQ_128-NEXT:    fmov x26, d0
-; VBITS_EQ_128-NEXT:    umulh x20, x20, x27
-; VBITS_EQ_128-NEXT:    fmov x27, d3
-; VBITS_EQ_128-NEXT:    fmov d20, x17
-; VBITS_EQ_128-NEXT:    umulh x7, x7, x23
-; VBITS_EQ_128-NEXT:    fmov x23, d4
-; VBITS_EQ_128-NEXT:    umulh x6, x6, x24
-; VBITS_EQ_128-NEXT:    fmov x24, d5
-; VBITS_EQ_128-NEXT:    umulh x26, x26, x27
-; VBITS_EQ_128-NEXT:    fmov x27, d7
-; VBITS_EQ_128-NEXT:    umulh x3, x3, x23
-; VBITS_EQ_128-NEXT:    fmov d19, x20
-; VBITS_EQ_128-NEXT:    mov x23, v2.d[1]
-; VBITS_EQ_128-NEXT:    umulh x2, x2, x24
-; VBITS_EQ_128-NEXT:    mov x24, v1.d[1]
-; VBITS_EQ_128-NEXT:    umulh x27, x8, x27
-; VBITS_EQ_128-NEXT:    mov x29, v0.d[1]
-; VBITS_EQ_128-NEXT:    mov x30, v7.d[1]
-; VBITS_EQ_128-NEXT:    mov x8, v5.d[1]
-; VBITS_EQ_128-NEXT:    mov x9, v4.d[1]
-; VBITS_EQ_128-NEXT:    mov x10, v3.d[1]
-; VBITS_EQ_128-NEXT:    ldp d10, d12, [sp, #24] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    umulh x30, x13, x30
-; VBITS_EQ_128-NEXT:    fmov d0, x27
-; VBITS_EQ_128-NEXT:    umulh x8, x23, x8
-; VBITS_EQ_128-NEXT:    fmov d2, x2
-; VBITS_EQ_128-NEXT:    umulh x9, x24, x9
-; VBITS_EQ_128-NEXT:    fmov d4, x3
-; VBITS_EQ_128-NEXT:    umulh x10, x29, x10
-; VBITS_EQ_128-NEXT:    fmov d6, x26
-; VBITS_EQ_128-NEXT:    mov v11.d[1], v10.d[0]
-; VBITS_EQ_128-NEXT:    fmov d1, x30
-; VBITS_EQ_128-NEXT:    mov v13.d[1], v12.d[0]
-; VBITS_EQ_128-NEXT:    mov v15.d[1], v14.d[0]
-; VBITS_EQ_128-NEXT:    mov v9.d[1], v8.d[0]
-; VBITS_EQ_128-NEXT:    fmov d3, x8
-; VBITS_EQ_128-NEXT:    fmov d5, x9
-; VBITS_EQ_128-NEXT:    fmov d7, x10
-; VBITS_EQ_128-NEXT:    fmov d17, x6
-; VBITS_EQ_128-NEXT:    fmov d16, x7
-; VBITS_EQ_128-NEXT:    fmov d18, x21
-; VBITS_EQ_128-NEXT:    fmov d21, x11
-; VBITS_EQ_128-NEXT:    fmov d22, x12
-; VBITS_EQ_128-NEXT:    fmov d23, x15
-; VBITS_EQ_128-NEXT:    fmov d24, x28
-; VBITS_EQ_128-NEXT:    fmov d25, x25
-; VBITS_EQ_128-NEXT:    fmov d26, x22
-; VBITS_EQ_128-NEXT:    fmov d27, x19
-; VBITS_EQ_128-NEXT:    fmov d28, x5
-; VBITS_EQ_128-NEXT:    fmov d29, x4
-; VBITS_EQ_128-NEXT:    fmov d30, x1
-; VBITS_EQ_128-NEXT:    fmov d31, x18
-; VBITS_EQ_128-NEXT:    mov v27.d[1], v26.d[0]
-; VBITS_EQ_128-NEXT:    stp q9, q15, [x0, #192]
-; VBITS_EQ_128-NEXT:    stp q13, q11, [x0, #224]
-; VBITS_EQ_128-NEXT:    mov v31.d[1], v30.d[0]
-; VBITS_EQ_128-NEXT:    mov v29.d[1], v28.d[0]
-; VBITS_EQ_128-NEXT:    mov v25.d[1], v24.d[0]
-; VBITS_EQ_128-NEXT:    mov v23.d[1], v22.d[0]
-; VBITS_EQ_128-NEXT:    mov v20.d[1], v21.d[0]
-; VBITS_EQ_128-NEXT:    mov v18.d[1], v19.d[0]
-; VBITS_EQ_128-NEXT:    stp q29, q31, [x0, #160]
-; VBITS_EQ_128-NEXT:    mov v16.d[1], v17.d[0]
-; VBITS_EQ_128-NEXT:    stp q25, q27, [x0, #128]
-; VBITS_EQ_128-NEXT:    mov v6.d[1], v7.d[0]
-; VBITS_EQ_128-NEXT:    mov v4.d[1], v5.d[0]
-; VBITS_EQ_128-NEXT:    stp q20, q23, [x0, #96]
-; VBITS_EQ_128-NEXT:    mov v2.d[1], v3.d[0]
-; VBITS_EQ_128-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_EQ_128-NEXT:    stp q16, q18, [x0, #64]
-; VBITS_EQ_128-NEXT:    ldp x20, x19, [sp, #208] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q4, q6, [x0, #32]
-; VBITS_EQ_128-NEXT:    ldp x22, x21, [sp, #192] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    stp q0, q2, [x0]
-; VBITS_EQ_128-NEXT:    ldp x24, x23, [sp, #176] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp x26, x25, [sp, #160] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp x28, x27, [sp, #144] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp x29, x30, [sp, #128] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d9, d8, [sp, #112] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d11, d10, [sp, #96] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d13, d12, [sp, #80] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    ldp d15, d14, [sp, #64] // 16-byte Folded Reload
-; VBITS_EQ_128-NEXT:    add sp, sp, #224
-; VBITS_EQ_128-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: umulh_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %1 = zext <32 x i64> %op1 to <32 x i128>
@@ -3506,5 +1310,3 @@ define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
   ret void
 }
 attributes #0 = { "target-features"="+sve" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; VBITS_GE_512: {{.*}}

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 9ce3873af774..1e6684b9f0e7 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -1,328 +1,364 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; UADDV
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define i8 @uaddv_v8i8(<8 x i8> %a) #0 {
+define i8 @uaddv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v8i8:
-; CHECK: addv b0, v0.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i8 @uaddv_v16i8(<16 x i8> %a) #0 {
+define i8 @uaddv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v16i8:
-; CHECK: addv b0, v0.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @uaddv_v32i8(<32 x i8>* %a) #0 {
+define i8 @uaddv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: uaddv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[LO]].b, [[HI]].b
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].b
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    add z0.b, z1.b, z0.b
+; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uaddv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @uaddv_v128i8(<128 x i8>* %a) #0 {
+define i8 @uaddv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uaddv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @uaddv_v256i8(<256 x i8>* %a) #0 {
+define i8 @uaddv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uaddv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i16 @uaddv_v4i16(<4 x i16> %a) #0 {
+define i16 @uaddv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v4i16:
-; CHECK: addv h0, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i16 @uaddv_v8i16(<8 x i16> %a) #0 {
+define i16 @uaddv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v8i16:
-; CHECK: addv h0, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @uaddv_v16i16(<16 x i16>* %a) #0 {
+define i16 @uaddv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @uaddv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: uaddv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[LO]].h, [[HI]].h
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].h
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    add z0.h, z1.h, z0.h
+; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uaddv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @uaddv_v64i16(<64 x i16>* %a) #0 {
+define i16 @uaddv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uaddv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @uaddv_v128i16(<128 x i16>* %a) #0 {
+define i16 @uaddv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uaddv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i32 @uaddv_v2i32(<2 x i32> %a) #0 {
+define i32 @uaddv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v2i32:
-; CHECK: addp v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i32 @uaddv_v4i32(<4 x i32> %a) #0 {
+define i32 @uaddv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v4i32:
-; CHECK: addv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @uaddv_v8i32(<8 x i32>* %a) #0 {
+define i32 @uaddv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @uaddv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: uaddv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[LO]].s, [[HI]].s
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].s
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    add z0.s, z1.s, z0.s
+; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uaddv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @uaddv_v32i32(<32 x i32>* %a) #0 {
+define i32 @uaddv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uaddv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @uaddv_v64i32(<64 x i32>* %a) #0 {
+define i32 @uaddv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uaddv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @uaddv_v1i64(<1 x i64> %a) #0 {
+define i64 @uaddv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i64 @uaddv_v2i64(<2 x i64> %a) #0 {
+define i64 @uaddv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v2i64:
-; CHECK: addp d0, v0.2d
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @uaddv_v4i64(<4 x i64>* %a) #0 {
+define i64 @uaddv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uaddv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @uaddv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: uaddv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uaddv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    add z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uaddv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @uaddv_v16i64(<16 x i64>* %a) #0 {
+define i64 @uaddv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uaddv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
+define i64 @uaddv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uaddv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
   ret i64 %res
@@ -333,306 +369,342 @@ define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define i8 @smaxv_v8i8(<8 x i8> %a) #0 {
+define i8 @smaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v8i8:
-; CHECK: smaxv b0, v0.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i8 @smaxv_v16i8(<16 x i8> %a) #0 {
+define i8 @smaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v16i8:
-; CHECK: smaxv b0, v0.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxv b0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @smaxv_v32i8(<32 x i8>* %a) #0 {
+define i8 @smaxv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    smaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @smaxv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: smaxv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    smaxv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smaxv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    smaxv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @smaxv_v128i8(<128 x i8>* %a) #0 {
+define i8 @smaxv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smaxv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    smaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @smaxv_v256i8(<256 x i8>* %a) #0 {
+define i8 @smaxv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smaxv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    smaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i16 @smaxv_v4i16(<4 x i16> %a) #0 {
+define i16 @smaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v4i16:
-; CHECK: smaxv h0, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i16 @smaxv_v8i16(<8 x i16> %a) #0 {
+define i16 @smaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v8i16:
-; CHECK: smaxv h0, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxv h0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @smaxv_v16i16(<16 x i16>* %a) #0 {
+define i16 @smaxv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    smaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @smaxv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: smaxv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    smaxv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smaxv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    smaxv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @smaxv_v64i16(<64 x i16>* %a) #0 {
+define i16 @smaxv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smaxv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    smaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @smaxv_v128i16(<128 x i16>* %a) #0 {
+define i16 @smaxv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smaxv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    smaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i32 @smaxv_v2i32(<2 x i32> %a) #0 {
+define i32 @smaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v2i32:
-; CHECK: smaxp v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i32 @smaxv_v4i32(<4 x i32> %a) #0 {
+define i32 @smaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v4i32:
-; CHECK: smaxv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @smaxv_v8i32(<8 x i32>* %a) #0 {
+define i32 @smaxv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    smaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @smaxv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: smaxv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    smaxv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smaxv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    smaxv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @smaxv_v32i32(<32 x i32>* %a) #0 {
+define i32 @smaxv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smaxv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    smaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @smaxv_v64i32(<64 x i32>* %a) #0 {
+define i32 @smaxv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smaxv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    smaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @smaxv_v1i64(<1 x i64> %a) #0 {
+define i64 @smaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; No NEON 64-bit vector SMAXV support. Use SVE.
-define i64 @smaxv_v2i64(<2 x i64> %a) #0 {
+define i64 @smaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @smaxv_v4i64(<4 x i64>* %a) #0 {
+define i64 @smaxv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @smaxv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: smaxv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: smaxv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    smaxv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: smaxv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    smaxv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @smaxv_v16i64(<16 x i64>* %a) #0 {
+define i64 @smaxv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: smaxv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
+define i64 @smaxv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: smaxv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
   ret i64 %res
@@ -643,306 +715,342 @@ define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define i8 @sminv_v8i8(<8 x i8> %a) #0 {
+define i8 @sminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v8i8:
-; CHECK: sminv b0, v0.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i8 @sminv_v16i8(<16 x i8> %a) #0 {
+define i8 @sminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v16i8:
-; CHECK: sminv b0, v0.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminv b0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @sminv_v32i8(<32 x i8>* %a) #0 {
+define i8 @sminv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    sminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @sminv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: sminv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: sminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    sminv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sminv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    sminv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @sminv_v128i8(<128 x i8>* %a) #0 {
+define i8 @sminv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sminv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    sminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @sminv_v256i8(<256 x i8>* %a) #0 {
+define i8 @sminv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sminv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    sminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i16 @sminv_v4i16(<4 x i16> %a) #0 {
+define i16 @sminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v4i16:
-; CHECK: sminv h0, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i16 @sminv_v8i16(<8 x i16> %a) #0 {
+define i16 @sminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v8i16:
-; CHECK: sminv h0, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminv h0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @sminv_v16i16(<16 x i16>* %a) #0 {
+define i16 @sminv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    sminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @sminv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: sminv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: sminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    sminv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sminv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    sminv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @sminv_v64i16(<64 x i16>* %a) #0 {
+define i16 @sminv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sminv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    sminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @sminv_v128i16(<128 x i16>* %a) #0 {
+define i16 @sminv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sminv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    sminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i32 @sminv_v2i32(<2 x i32> %a) #0 {
+define i32 @sminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v2i32:
-; CHECK: minp v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i32 @sminv_v4i32(<4 x i32> %a) #0 {
+define i32 @sminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v4i32:
-; CHECK: sminv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @sminv_v8i32(<8 x i32>* %a) #0 {
+define i32 @sminv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @sminv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: sminv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: sminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    sminv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sminv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    sminv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @sminv_v32i32(<32 x i32>* %a) #0 {
+define i32 @sminv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sminv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @sminv_v64i32(<64 x i32>* %a) #0 {
+define i32 @sminv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sminv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @sminv_v1i64(<1 x i64> %a) #0 {
+define i64 @sminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; No NEON 64-bit vector SMINV support. Use SVE.
-define i64 @sminv_v2i64(<2 x i64> %a) #0 {
+define i64 @sminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    sminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @sminv_v4i64(<4 x i64>* %a) #0 {
+define i64 @sminv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    sminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @sminv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: sminv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: sminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: sminv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    sminv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sminv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    sminv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @sminv_v16i64(<16 x i64>* %a) #0 {
+define i64 @sminv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: sminv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    sminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
+define i64 @sminv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sminv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    sminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
   ret i64 %res
@@ -953,306 +1061,342 @@ define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define i8 @umaxv_v8i8(<8 x i8> %a) #0 {
+define i8 @umaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v8i8:
-; CHECK: umaxv b0, v0.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i8 @umaxv_v16i8(<16 x i8> %a) #0 {
+define i8 @umaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v16i8:
-; CHECK: umaxv b0, v0.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @umaxv_v32i8(<32 x i8>* %a) #0 {
+define i8 @umaxv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    umaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @umaxv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: umaxv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    umaxv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umaxv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    umaxv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @umaxv_v128i8(<128 x i8>* %a) #0 {
+define i8 @umaxv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umaxv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    umaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @umaxv_v256i8(<256 x i8>* %a) #0 {
+define i8 @umaxv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umaxv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    umaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i16 @umaxv_v4i16(<4 x i16> %a) #0 {
+define i16 @umaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v4i16:
-; CHECK: umaxv h0, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i16 @umaxv_v8i16(<8 x i16> %a) #0 {
+define i16 @umaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v8i16:
-; CHECK: umaxv h0, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv h0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @umaxv_v16i16(<16 x i16>* %a) #0 {
+define i16 @umaxv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @umaxv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: umaxv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    umaxv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umaxv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    umaxv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @umaxv_v64i16(<64 x i16>* %a) #0 {
+define i16 @umaxv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umaxv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @umaxv_v128i16(<128 x i16>* %a) #0 {
+define i16 @umaxv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umaxv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i32 @umaxv_v2i32(<2 x i32> %a) #0 {
+define i32 @umaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v2i32:
-; CHECK: umaxp v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i32 @umaxv_v4i32(<4 x i32> %a) #0 {
+define i32 @umaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v4i32:
-; CHECK: umaxv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @umaxv_v8i32(<8 x i32>* %a) #0 {
+define i32 @umaxv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    umaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @umaxv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: umaxv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    umaxv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umaxv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    umaxv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @umaxv_v32i32(<32 x i32>* %a) #0 {
+define i32 @umaxv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umaxv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    umaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @umaxv_v64i32(<64 x i32>* %a) #0 {
+define i32 @umaxv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umaxv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    umaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @umaxv_v1i64(<1 x i64> %a) #0 {
+define i64 @umaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; No NEON 64-bit vector UMAXV support. Use SVE.
-define i64 @umaxv_v2i64(<2 x i64> %a) #0 {
+define i64 @umaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    umaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @umaxv_v4i64(<4 x i64>* %a) #0 {
+define i64 @umaxv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    umaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @umaxv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: umaxv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: umaxv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    umaxv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: umaxv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    umaxv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @umaxv_v16i64(<16 x i64>* %a) #0 {
+define i64 @umaxv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: umaxv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    umaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
+define i64 @umaxv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: umaxv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    umaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
   ret i64 %res
@@ -1263,306 +1407,342 @@ define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define i8 @uminv_v8i8(<8 x i8> %a) #0 {
+define i8 @uminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v8i8:
-; CHECK: uminv b0, v0.8b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i8 @uminv_v16i8(<16 x i8> %a) #0 {
+define i8 @uminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v16i8:
-; CHECK: uminv b0, v0.16b
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv b0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @uminv_v32i8(<32 x i8>* %a) #0 {
+define i8 @uminv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @uminv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: uminv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
-; VBITS_EQ_256-DAG: uminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    uminv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uminv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uminv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @uminv_v128i8(<128 x i8>* %a) #0 {
+define i8 @uminv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uminv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @uminv_v256i8(<256 x i8>* %a) #0 {
+define i8 @uminv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uminv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i16 @uminv_v4i16(<4 x i16> %a) #0 {
+define i16 @uminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v4i16:
-; CHECK: uminv h0, v0.4h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i16 @uminv_v8i16(<8 x i16> %a) #0 {
+define i16 @uminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v8i16:
-; CHECK: uminv h0, v0.8h
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv h0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @uminv_v16i16(<16 x i16>* %a) #0 {
+define i16 @uminv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @uminv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: uminv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
-; VBITS_EQ_256-DAG: uminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    uminv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uminv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uminv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @uminv_v64i16(<64 x i16>* %a) #0 {
+define i16 @uminv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uminv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @uminv_v128i16(<128 x i16>* %a) #0 {
+define i16 @uminv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uminv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; Don't use SVE for 64-bit vectors.
-define i32 @uminv_v2i32(<2 x i32> %a) #0 {
+define i32 @uminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v2i32:
-; CHECK: minp v0.2s, v0.2s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define i32 @uminv_v4i32(<4 x i32> %a) #0 {
+define i32 @uminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v4i32:
-; CHECK: uminv s0, v0.4s
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @uminv_v8i32(<8 x i32>* %a) #0 {
+define i32 @uminv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @uminv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: uminv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
-; VBITS_EQ_256-DAG: uminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    uminv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uminv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uminv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @uminv_v32i32(<32 x i32>* %a) #0 {
+define i32 @uminv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uminv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @uminv_v64i32(<64 x i32>* %a) #0 {
+define i32 @uminv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uminv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @uminv_v1i64(<1 x i64> %a) #0 {
+define i64 @uminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; No NEON 64-bit vector UMINV support. Use SVE.
-define i64 @uminv_v2i64(<2 x i64> %a) #0 {
+define i64 @uminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @uminv_v4i64(<4 x i64>* %a) #0 {
+define i64 @uminv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @uminv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: uminv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
-; VBITS_EQ_256-DAG: uminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: uminv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    uminv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: uminv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uminv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @uminv_v16i64(<16 x i64>* %a) #0 {
+define i64 @uminv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: uminv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @uminv_v32i64(<32 x i64>* %a) #0 {
+define i64 @uminv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: uminv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
   ret i64 %res

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 1a7774bd1174..e0dea9c6c962 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -1,19 +1,8 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=VBITS_EQ_128
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048,VBITS_EQ_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,172 +13,171 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: srem_v8i8:
-; CHECK: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1:z[0-9]+]].b
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s3, [[SCALAR1]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR2]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR7]]
-; CHECK-NEXT: umov [[SCALAR8:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR8]]
-; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v8i8:
-; VBITS_EQ_128:         sshll v2.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sshll v3.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT:    sunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT:    sunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT:    xtn v2.8b, v2.8h
-; VBITS_EQ_128-NEXT:    mls v0.8b, v2.8b, v1.8b
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: srem_v8i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    sshll v2.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v3.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    sunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
+; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    umov w8, v2.h[0]
+; VBITS_GE_256-NEXT:    umov w9, v2.h[1]
+; VBITS_GE_256-NEXT:    fmov s3, w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[2]
+; VBITS_GE_256-NEXT:    mov v3.b[1], w9
+; VBITS_GE_256-NEXT:    mov v3.b[2], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
+; VBITS_GE_256-NEXT:    mov v3.b[3], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[4]
+; VBITS_GE_256-NEXT:    mov v3.b[4], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[5]
+; VBITS_GE_256-NEXT:    mov v3.b[5], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[6]
+; VBITS_GE_256-NEXT:    mov v3.b[6], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[7]
+; VBITS_GE_256-NEXT:    mov v3.b[7], w8
+; VBITS_GE_256-NEXT:    mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v8i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    sunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    umov w8, v2.h[0]
+; VBITS_GE_512-NEXT:    umov w9, v2.h[1]
+; VBITS_GE_512-NEXT:    fmov s3, w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[2]
+; VBITS_GE_512-NEXT:    mov v3.b[1], w9
+; VBITS_GE_512-NEXT:    mov v3.b[2], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[3]
+; VBITS_GE_512-NEXT:    mov v3.b[3], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[4]
+; VBITS_GE_512-NEXT:    mov v3.b[4], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[5]
+; VBITS_GE_512-NEXT:    mov v3.b[5], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[6]
+; VBITS_GE_512-NEXT:    mov v3.b[6], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[7]
+; VBITS_GE_512-NEXT:    mov v3.b[7], w8
+; VBITS_GE_512-NEXT:    mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: srem_v16i8:
-
-; HALF VECTOR
-; VBITS_EQ_256: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_256-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls v0.16b, v2.16b, v1.16b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v16i8:
-; VBITS_EQ_128:         sunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT:    sunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z2.h
-; VBITS_EQ_128-NEXT:    sunpkhi z6.s, z3.h
-; VBITS_EQ_128-NEXT:    sunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    sunpklo z4.h, z1.b
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    sunpklo z3.h, z0.b
-; VBITS_EQ_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; VBITS_EQ_128-NEXT:    sunpkhi z6.s, z4.h
-; VBITS_EQ_128-NEXT:    sunpkhi z7.s, z3.h
-; VBITS_EQ_128-NEXT:    sunpklo z4.s, z4.h
-; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; VBITS_EQ_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z5.h
-; VBITS_EQ_128-NEXT:    uzp1 z3.h, z3.h, z6.h
-; VBITS_EQ_128-NEXT:    uzp1 z2.b, z3.b, z2.b
-; VBITS_EQ_128-NEXT:    mls v0.16b, v2.16b, v1.16b
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: srem_v16i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    sunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT:    sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT:    sunpkhi z6.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    sunpkhi z6.s, z4.h
+; VBITS_GE_128-NEXT:    sunpkhi z7.s, z3.h
+; VBITS_GE_128-NEXT:    sunpklo z4.s, z4.h
+; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z5.h
+; VBITS_GE_128-NEXT:    uzp1 z3.h, z3.h, z6.h
+; VBITS_GE_128-NEXT:    uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v16i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpkhi z4.s, z2.h
+; VBITS_GE_256-NEXT:    sunpkhi z5.s, z3.h
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v16i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    sunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_512-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: srem_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: sdivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_256-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = srem <32 x i8> %op1, %op2
@@ -197,69 +185,23 @@ define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: sdivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = srem <64 x i8> %op1, %op2
@@ -267,54 +209,26 @@ define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: sdiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: sdivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    sunpkhi z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z5.s, z3.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = srem <128 x i8> %op1, %op2
@@ -322,36 +236,35 @@ define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v256i8:
-
-; FULL VECTOR:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: sdivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sdivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: sdiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: sdivr   [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1    [[ZIP:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
-; VBITS_EQ_2048-NEXT: mls     [[OP1]].b, [[PG]]/m, [[ZIP]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b    { [[OP1]].b }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    sunpkhi z2.h, z1.b
+; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sunpklo z5.h, z0.b
+; CHECK-NEXT:    sunpkhi z6.s, z2.h
+; CHECK-NEXT:    sunpkhi z7.s, z3.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z6.s, p1/m, z6.s, z7.s
+; CHECK-NEXT:    sunpkhi z7.s, z4.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    sunpkhi z3.s, z5.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    sdiv z3.s, p1/m, z3.s, z7.s
+; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = srem <256 x i8> %op1, %op2
@@ -362,93 +275,154 @@ define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: srem_v4i16:
-; CHECK: sshll v2.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: sshll v3.4s, v0.4h, #0
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
-; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
-; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
-; CHECK-NEXT: mov [[VEC2:v[0-9]+]].16b, [[VEC]].16b
-; CHECK-NEXT: mov [[VEC2]].h[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[SCALAR3:w[0-9]+]], [[VEC]].s[3]
-; CHECK-NEXT: mov [[VEC2]].h[2], [[SCALAR2]]
-; CHECK-NEXT: mov [[VEC2]].h[3], [[SCALAR3]]
-; CHECK-NEXT: mls v0.4h, [[VEC2]].4h, v1.4h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v4i16:
-; VBITS_EQ_128:         sshll v2.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sshll v3.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    xtn v2.4h, v2.4s
-; VBITS_EQ_128-NEXT:    mls v0.4h, v2.4h, v1.4h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: srem_v4i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    sshll v2.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v3.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
+; VBITS_GE_128-NEXT:    mls v0.4h, v2.4h, v1.4h
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v4i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    sshll v2.4s, v1.4h, #0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    sshll v3.4s, v0.4h, #0
+; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
+; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
+; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_256-NEXT:    mov v3.h[1], w8
+; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
+; VBITS_GE_256-NEXT:    mov v3.h[2], w9
+; VBITS_GE_256-NEXT:    mov v3.h[3], w8
+; VBITS_GE_256-NEXT:    mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v4i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    sshll v2.4s, v1.4h, #0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    sshll v3.4s, v0.4h, #0
+; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
+; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
+; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_512-NEXT:    mov v3.h[1], w8
+; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
+; VBITS_GE_512-NEXT:    mov v3.h[2], w9
+; VBITS_GE_512-NEXT:    mov v3.h[3], w8
+; VBITS_GE_512-NEXT:    mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_512-NEXT:    ret
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: srem_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v8i16:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT:    sunpklo z4.s, z1.h
-; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    sunpklo z5.s, z0.h
-; VBITS_EQ_128-NEXT:    movprfx z3, z5
-; VBITS_EQ_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT:    uzp1 z2.h, z3.h, z2.h
-; VBITS_EQ_128-NEXT:    mls v0.8h, v2.8h, v1.8h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: srem_v8i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT:    sunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    sunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT:    movprfx z3, z5
+; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
 define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: srem_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_256-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
-
+; VBITS_GE_128-LABEL: srem_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sunpkhi z17.s, z2.h
+; VBITS_GE_128-NEXT:    ldp q3, q1, [x1]
+; VBITS_GE_128-NEXT:    sunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT:    sunpklo z7.s, z0.h
+; VBITS_GE_128-NEXT:    sunpkhi z16.s, z3.h
+; VBITS_GE_128-NEXT:    sdivr z16.s, p0/m, z16.s, z17.s
+; VBITS_GE_128-NEXT:    sunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT:    sunpklo z6.s, z1.h
+; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sunpklo z5.s, z3.h
+; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT:    sunpklo z7.s, z2.h
+; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT:    uzp1 z4.h, z6.h, z4.h
+; VBITS_GE_128-NEXT:    uzp1 z5.h, z5.h, z16.h
+; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v3.8h
+; VBITS_GE_128-NEXT:    mls v0.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT:    stp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    sunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z5.s, z0.h
+; VBITS_GE_256-NEXT:    movprfx z3, z5
+; VBITS_GE_256-NEXT:    sdiv z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = srem <16 x i16> %op1, %op2
@@ -456,37 +430,20 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: srem_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = srem <32 x i16> %op1, %op2
@@ -494,35 +451,20 @@ define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v64i16:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = srem <64 x i16> %op1, %op2
@@ -530,23 +472,24 @@ define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v128i16:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: movprfx [[OP3_LO:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_2048-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP3_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_2048-NEXT: mls [[OP1]].h, [[PG]]/m, [[ZIP]].h, [[OP2]].h
-; VBITS_EQ_2048-NEXT: st1h { [[OP1]].h }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    sunpkhi z2.s, z1.h
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    sdiv z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = srem <128 x i16> %op1, %op2
@@ -555,55 +498,48 @@ define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
-define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: srem_v2i32:
-; VBITS_EQ_128:         ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT:    mls v0.2s, v2.2s, v1.2s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
-define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: srem_v4i32:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT:    mls v0.4s, v2.4s, v1.4s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: srem_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; CHECK-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = srem <8 x i32> %op1, %op2
@@ -612,15 +548,57 @@ define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @srem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: srem_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: srem_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT:    ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT:    movprfx z16, z0
+; VBITS_GE_128-NEXT:    sdiv z16.s, p0/m, z16.s, z4.s
+; VBITS_GE_128-NEXT:    mls v0.4s, v16.4s, v4.4s
+; VBITS_GE_128-NEXT:    movprfx z4, z3
+; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT:    movprfx z16, z1
+; VBITS_GE_128-NEXT:    sdiv z16.s, p0/m, z16.s, z5.s
+; VBITS_GE_128-NEXT:    mls v1.4s, v16.4s, v5.4s
+; VBITS_GE_128-NEXT:    movprfx z5, z2
+; VBITS_GE_128-NEXT:    sdiv z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT:    mls v2.4s, v5.4s, v7.4s
+; VBITS_GE_128-NEXT:    mls v3.4s, v4.4s, v6.4s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z4, z0
+; VBITS_GE_256-NEXT:    sdiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT:    movprfx z5, z1
+; VBITS_GE_256-NEXT:    sdiv z5.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    movprfx z2, z0
+; VBITS_GE_512-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = srem <16 x i32> %op1, %op2
@@ -628,16 +606,17 @@ define void @srem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: srem_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = srem <32 x i32> %op1, %op2
@@ -645,16 +624,17 @@ define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = srem <64 x i32> %op1, %op2
@@ -664,60 +644,49 @@ define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub d0, d0, d1
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: srem_v1i64:
-; VBITS_EQ_128:         ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT:    sub d0, d0, d1
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    ret
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: srem_v2i64:
-; VBITS_EQ_128:         ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT:    sub v0.2d, v0.2d, v1.2d
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: srem_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; CHECK-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = srem <4 x i64> %op1, %op2
@@ -726,15 +695,61 @@ define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @srem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: srem_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: srem_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1]
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ldp q7, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    movprfx z16, z3
+; VBITS_GE_128-NEXT:    sdiv z16.d, p0/m, z16.d, z5.d
+; VBITS_GE_128-NEXT:    movprfx z17, z2
+; VBITS_GE_128-NEXT:    sdiv z17.d, p0/m, z17.d, z4.d
+; VBITS_GE_128-NEXT:    mul z5.d, p0/m, z5.d, z16.d
+; VBITS_GE_128-NEXT:    movprfx z16, z1
+; VBITS_GE_128-NEXT:    sdiv z16.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT:    mul z4.d, p0/m, z4.d, z17.d
+; VBITS_GE_128-NEXT:    movprfx z17, z0
+; VBITS_GE_128-NEXT:    sdiv z17.d, p0/m, z17.d, z7.d
+; VBITS_GE_128-NEXT:    mul z6.d, p0/m, z6.d, z16.d
+; VBITS_GE_128-NEXT:    mul z7.d, p0/m, z7.d, z17.d
+; VBITS_GE_128-NEXT:    sub v0.2d, v0.2d, v7.2d
+; VBITS_GE_128-NEXT:    sub v1.2d, v1.2d, v6.2d
+; VBITS_GE_128-NEXT:    sub v2.2d, v2.2d, v4.2d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    sub v0.2d, v3.2d, v5.2d
+; VBITS_GE_128-NEXT:    stp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: srem_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z4, z0
+; VBITS_GE_256-NEXT:    sdiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT:    movprfx z5, z1
+; VBITS_GE_256-NEXT:    sdiv z5.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: srem_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    movprfx z2, z0
+; VBITS_GE_512-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = srem <8 x i64> %op1, %op2
@@ -742,16 +757,17 @@ define void @srem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: srem_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = srem <16 x i64> %op1, %op2
@@ -759,16 +775,17 @@ define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: srem_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = srem <32 x i64> %op1, %op2
@@ -783,172 +800,171 @@ define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ; Vector vXi8 udiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
-; CHECK-LABEL: urem_v8i8:
-; CHECK: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1:z[0-9]+]].b
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
-; CHECK-NEXT: fmov s3, [[SCALAR0]]
-; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
-; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
-; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
-; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
-; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
-; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
-; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v8i8:
-; VBITS_EQ_128:         ushll v2.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    ushll v3.8h, v0.8b, #0
-; VBITS_EQ_128-NEXT:    uunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT:    uunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z3.h
-; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z4.h
-; VBITS_EQ_128-NEXT:    xtn v2.8b, v2.8h
-; VBITS_EQ_128-NEXT:    mls v0.8b, v2.8b, v1.8b
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: urem_v8i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ushll v2.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v3.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    uunpkhi z4.s, z2.h
+; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
+; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    umov w8, v2.h[0]
+; VBITS_GE_256-NEXT:    umov w9, v2.h[1]
+; VBITS_GE_256-NEXT:    fmov s3, w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[2]
+; VBITS_GE_256-NEXT:    mov v3.b[1], w9
+; VBITS_GE_256-NEXT:    mov v3.b[2], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
+; VBITS_GE_256-NEXT:    mov v3.b[3], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[4]
+; VBITS_GE_256-NEXT:    mov v3.b[4], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[5]
+; VBITS_GE_256-NEXT:    mov v3.b[5], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[6]
+; VBITS_GE_256-NEXT:    mov v3.b[6], w8
+; VBITS_GE_256-NEXT:    umov w8, v2.h[7]
+; VBITS_GE_256-NEXT:    mov v3.b[7], w8
+; VBITS_GE_256-NEXT:    mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v8i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_512-NEXT:    uunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    umov w8, v2.h[0]
+; VBITS_GE_512-NEXT:    umov w9, v2.h[1]
+; VBITS_GE_512-NEXT:    fmov s3, w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[2]
+; VBITS_GE_512-NEXT:    mov v3.b[1], w9
+; VBITS_GE_512-NEXT:    mov v3.b[2], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[3]
+; VBITS_GE_512-NEXT:    mov v3.b[3], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[4]
+; VBITS_GE_512-NEXT:    mov v3.b[4], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[5]
+; VBITS_GE_512-NEXT:    mov v3.b[5], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[6]
+; VBITS_GE_512-NEXT:    mov v3.b[6], w8
+; VBITS_GE_512-NEXT:    umov w8, v2.h[7]
+; VBITS_GE_512-NEXT:    mov v3.b[7], w8
+; VBITS_GE_512-NEXT:    mls v0.8b, v3.8b, v1.8b
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
-; CHECK-LABEL: urem_v16i8:
-
-; HALF VECTOR
-; VBITS_EQ_256: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_256-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls v0.16b, v2.16b, v1.16b
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_512: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v16i8:
-; VBITS_EQ_128:         uunpkhi z2.h, z1.b
-; VBITS_EQ_128-NEXT:    uunpkhi z3.h, z0.b
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z2.h
-; VBITS_EQ_128-NEXT:    uunpkhi z6.s, z3.h
-; VBITS_EQ_128-NEXT:    uunpklo z2.s, z2.h
-; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    uunpklo z4.h, z1.b
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    uunpklo z3.h, z0.b
-; VBITS_EQ_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; VBITS_EQ_128-NEXT:    uunpkhi z6.s, z4.h
-; VBITS_EQ_128-NEXT:    uunpkhi z7.s, z3.h
-; VBITS_EQ_128-NEXT:    uunpklo z4.s, z4.h
-; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_EQ_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; VBITS_EQ_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z5.h
-; VBITS_EQ_128-NEXT:    uzp1 z3.h, z3.h, z6.h
-; VBITS_EQ_128-NEXT:    uzp1 z2.b, z3.b, z2.b
-; VBITS_EQ_128-NEXT:    mls v0.16b, v2.16b, v1.16b
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: urem_v16i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    uunpkhi z2.h, z1.b
+; VBITS_GE_128-NEXT:    uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z2.h
+; VBITS_GE_128-NEXT:    uunpkhi z6.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    uunpkhi z6.s, z4.h
+; VBITS_GE_128-NEXT:    uunpkhi z7.s, z3.h
+; VBITS_GE_128-NEXT:    uunpklo z4.s, z4.h
+; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z5.h
+; VBITS_GE_128-NEXT:    uzp1 z3.h, z3.h, z6.h
+; VBITS_GE_128-NEXT:    uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v16i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpkhi z4.s, z2.h
+; VBITS_GE_256-NEXT:    uunpkhi z5.s, z3.h
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v16i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    uunpklo z2.h, z1.b
+; VBITS_GE_512-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_512-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_512-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: urem_v32i8:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_256-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_256-NEXT: udivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_256-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_256-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = urem <32 x i8> %op1, %op2
@@ -956,69 +972,23 @@ define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
   ret void
 }
 
-define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v64i8:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_512-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_512-NEXT: udivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; QUARTER VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_GE_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = urem <64 x i8> %op1, %op2
@@ -1026,54 +996,26 @@ define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v128i8:
-
-; FULL VECTOR:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_1024-NEXT: udiv [[DIV3:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_1024-NEXT: udivr [[DIV4:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b
-; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b
-; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-
-; HALF VECTOR:
-; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_HI]].s, [[OP1_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
-; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    uunpkhi z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = urem <128 x i8> %op1, %op2
@@ -1081,34 +1023,35 @@ define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v256i8:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
-; VBITS_EQ_2048-NEXT: udivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: udivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_EQ_2048-NEXT: udiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
-; VBITS_EQ_2048-NEXT: udivr   [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
-; VBITS_EQ_2048-NEXT: uzp1    [[ZIP:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
-; VBITS_EQ_2048-NEXT: mls     [[OP1]].b, [[PG]]/m, [[ZIP]].b, [[OP2]].b
-; VBITS_EQ_2048-NEXT: st1b    { [[OP1]].b }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uunpkhi z2.h, z1.b
+; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    uunpklo z5.h, z0.b
+; CHECK-NEXT:    uunpkhi z6.s, z2.h
+; CHECK-NEXT:    uunpkhi z7.s, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z6.s, p1/m, z6.s, z7.s
+; CHECK-NEXT:    uunpkhi z7.s, z4.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uunpkhi z3.s, z5.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    udiv z3.s, p1/m, z3.s, z7.s
+; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = urem <256 x i8> %op1, %op2
@@ -1119,92 +1062,154 @@ define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
-; CHECK-LABEL: urem_v4i16:
-; CHECK: ushll v2.4s, v1.4h, #0
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
-; CHECK-NEXT: ushll v3.4s, v0.4h, #0
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
-; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
-; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
-; CHECK-NEXT: mov v3.16b, v2.16b
-; CHECK-NEXT: mov [[VECO:v[0-9]+]].h[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[SCALAR3:w[0-9]+]], [[VEC]].s[3]
-; CHECK-NEXT: mov [[VECO]].h[2], [[SCALAR2]]
-; CHECK-NEXT: mov [[VECO]].h[3], [[SCALAR3]]
-; CHECK-NEXT: mls v0.4h, [[VECO]].4h, v1.4h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v4i16:
-; VBITS_EQ_128:         ushll v2.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    ushll v3.4s, v0.4h, #0
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    xtn v2.4h, v2.4s
-; VBITS_EQ_128-NEXT:    mls v0.4h, v2.4h, v1.4h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: urem_v4i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ushll v2.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v3.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
+; VBITS_GE_128-NEXT:    mls v0.4h, v2.4h, v1.4h
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v4i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ushll v2.4s, v1.4h, #0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    ushll v3.4s, v0.4h, #0
+; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
+; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
+; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_256-NEXT:    mov v3.h[1], w8
+; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
+; VBITS_GE_256-NEXT:    mov v3.h[2], w9
+; VBITS_GE_256-NEXT:    mov v3.h[3], w8
+; VBITS_GE_256-NEXT:    mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v4i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ushll v2.4s, v1.4h, #0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    ushll v3.4s, v0.4h, #0
+; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
+; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
+; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_512-NEXT:    mov v3.h[1], w8
+; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
+; VBITS_GE_512-NEXT:    mov v3.h[2], w9
+; VBITS_GE_512-NEXT:    mov v3.h[3], w8
+; VBITS_GE_512-NEXT:    mls v0.4h, v3.4h, v1.4h
+; VBITS_GE_512-NEXT:    ret
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
-; CHECK-LABEL: urem_v8i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
-; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v8i16:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_EQ_128-NEXT:    uunpklo z4.s, z1.h
-; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_EQ_128-NEXT:    uunpklo z5.s, z0.h
-; VBITS_EQ_128-NEXT:    movprfx z3, z5
-; VBITS_EQ_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_EQ_128-NEXT:    uzp1 z2.h, z3.h, z2.h
-; VBITS_EQ_128-NEXT:    mls v0.8h, v2.8h, v1.8h
-; VBITS_EQ_128-NEXT:    ret
-
+; VBITS_GE_128-LABEL: urem_v8i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT:    uunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    uunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT:    movprfx z3, z5
+; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
 define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; CHECK-LABEL: urem_v16i16:
-
-; FULL VECTOR:
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_256-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_256-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; VBITS_GE_128-LABEL: urem_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    uunpkhi z17.s, z2.h
+; VBITS_GE_128-NEXT:    ldp q3, q1, [x1]
+; VBITS_GE_128-NEXT:    uunpkhi z5.s, z0.h
+; VBITS_GE_128-NEXT:    uunpklo z7.s, z0.h
+; VBITS_GE_128-NEXT:    uunpkhi z16.s, z3.h
+; VBITS_GE_128-NEXT:    udivr z16.s, p0/m, z16.s, z17.s
+; VBITS_GE_128-NEXT:    uunpkhi z4.s, z1.h
+; VBITS_GE_128-NEXT:    uunpklo z6.s, z1.h
+; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    uunpklo z5.s, z3.h
+; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT:    uunpklo z7.s, z2.h
+; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT:    uzp1 z4.h, z6.h, z4.h
+; VBITS_GE_128-NEXT:    uzp1 z5.h, z5.h, z16.h
+; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v3.8h
+; VBITS_GE_128-NEXT:    mls v0.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT:    stp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    uunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z5.s, z0.h
+; VBITS_GE_256-NEXT:    movprfx z3, z5
+; VBITS_GE_256-NEXT:    udiv z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = urem <16 x i16> %op1, %op2
@@ -1212,37 +1217,20 @@ define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
   ret void
 }
 
-define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: urem_v32i16:
-
-; FULL VECTOR:
-; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = urem <32 x i16> %op1, %op2
@@ -1250,35 +1238,20 @@ define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v64i16:
-; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
-; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
-; VBITS_EQ_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_EQ_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-
-; HALF VECTOR OR SMALLER:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
-; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; VBITS_GE_2048-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = urem <64 x i16> %op1, %op2
@@ -1286,23 +1259,24 @@ define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v128i16:
-; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: movprfx [[RES_LO:z[0-9]+]], [[OP1_LO]]
-; VBITS_EQ_2048-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[RES_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
-; VBITS_EQ_2048-NEXT: mls [[OP1]].h, [[PG]]/m, [[ZIP]].h, [[OP2]].h
-; VBITS_EQ_2048-NEXT: st1h { [[OP1]].h }, [[PG]], [x0]
-; VBITS_EQ_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ptrue p1.s, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uunpkhi z2.s, z1.h
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    udiv z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = urem <128 x i16> %op1, %op2
@@ -1311,55 +1285,48 @@ define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Vector v2i32 udiv are not legal for NEON so use SVE when available.
-define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK: ret
-
-; VBITS_EQ_128-LABEL: urem_v2i32:
-; VBITS_EQ_128:         ptrue p0.s, vl2
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT:    mls v0.2s, v2.2s, v1.2s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Vector v4i32 udiv are not legal for NEON so use SVE when available.
-define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], z0
-; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: urem_v4i32:
-; VBITS_EQ_128:         ptrue p0.s, vl4
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
-; VBITS_EQ_128-NEXT:    mls v0.4s, v2.4s, v1.4s
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: urem_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; CHECK-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = urem <8 x i32> %op1, %op2
@@ -1368,15 +1335,57 @@ define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @urem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: urem_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: urem_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
+; VBITS_GE_128-NEXT:    ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT:    movprfx z16, z0
+; VBITS_GE_128-NEXT:    udiv z16.s, p0/m, z16.s, z4.s
+; VBITS_GE_128-NEXT:    mls v0.4s, v16.4s, v4.4s
+; VBITS_GE_128-NEXT:    movprfx z4, z3
+; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT:    movprfx z16, z1
+; VBITS_GE_128-NEXT:    udiv z16.s, p0/m, z16.s, z5.s
+; VBITS_GE_128-NEXT:    mls v1.4s, v16.4s, v5.4s
+; VBITS_GE_128-NEXT:    movprfx z5, z2
+; VBITS_GE_128-NEXT:    udiv z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_128-NEXT:    mls v2.4s, v5.4s, v7.4s
+; VBITS_GE_128-NEXT:    mls v3.4s, v4.4s, v6.4s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z4, z0
+; VBITS_GE_256-NEXT:    udiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT:    movprfx z5, z1
+; VBITS_GE_256-NEXT:    udiv z5.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    movprfx z2, z0
+; VBITS_GE_512-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = urem <16 x i32> %op1, %op2
@@ -1384,16 +1393,17 @@ define void @urem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: urem_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = urem <32 x i32> %op1, %op2
@@ -1401,16 +1411,17 @@ define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[OP1]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = urem <64 x i32> %op1, %op2
@@ -1420,60 +1431,49 @@ define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub d0, d0, d1
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: urem_v1i64:
-; VBITS_EQ_128:         ptrue p0.d, vl1
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT:    sub d0, d0, d1
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    ret
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
 ; FIXME: We should be able to improve the codegen for the 128 bits case here.
-define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, z1.d
-; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
-; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
-
-; VBITS_EQ_128-LABEL: urem_v2i64:
-; VBITS_EQ_128:         ptrue p0.d, vl2
-; VBITS_EQ_128-NEXT:    movprfx z2, z0
-; VBITS_EQ_128-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; VBITS_EQ_128-NEXT:    sub v0.2d, v0.2d, v1.2d
-; VBITS_EQ_128-NEXT:    ret
-
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: urem_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; CHECK-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = urem <4 x i64> %op1, %op2
@@ -1482,15 +1482,61 @@ define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @urem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: urem_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_128-LABEL: urem_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    ldp q4, q5, [x1]
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ldp q7, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    movprfx z16, z3
+; VBITS_GE_128-NEXT:    udiv z16.d, p0/m, z16.d, z5.d
+; VBITS_GE_128-NEXT:    movprfx z17, z2
+; VBITS_GE_128-NEXT:    udiv z17.d, p0/m, z17.d, z4.d
+; VBITS_GE_128-NEXT:    mul z5.d, p0/m, z5.d, z16.d
+; VBITS_GE_128-NEXT:    movprfx z16, z1
+; VBITS_GE_128-NEXT:    udiv z16.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT:    mul z4.d, p0/m, z4.d, z17.d
+; VBITS_GE_128-NEXT:    movprfx z17, z0
+; VBITS_GE_128-NEXT:    udiv z17.d, p0/m, z17.d, z7.d
+; VBITS_GE_128-NEXT:    mul z6.d, p0/m, z6.d, z16.d
+; VBITS_GE_128-NEXT:    mul z7.d, p0/m, z7.d, z17.d
+; VBITS_GE_128-NEXT:    sub v0.2d, v0.2d, v7.2d
+; VBITS_GE_128-NEXT:    sub v1.2d, v1.2d, v6.2d
+; VBITS_GE_128-NEXT:    sub v2.2d, v2.2d, v4.2d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    sub v0.2d, v3.2d, v5.2d
+; VBITS_GE_128-NEXT:    stp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: urem_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z4, z0
+; VBITS_GE_256-NEXT:    udiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT:    movprfx z5, z1
+; VBITS_GE_256-NEXT:    udiv z5.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: urem_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    movprfx z2, z0
+; VBITS_GE_512-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = urem <8 x i64> %op1, %op2
@@ -1498,16 +1544,17 @@ define void @urem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: urem_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_1024-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = urem <16 x i64> %op1, %op2
@@ -1515,16 +1562,17 @@ define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @urem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @urem_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: urem_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]]
-; VBITS_GE_2048-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[OP1]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = urem <32 x i64> %op1, %op2

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
index 639b4a96e364..8b76c00631bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
@@ -1,62 +1,50 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 {
+define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i8:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.8b, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v2.8b, w8
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 {
+define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16i8:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.16b, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v2.16b, w8
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
 
-define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
+define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v32i8:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    and z2.b, z2.b, #0x1
+; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <32 x i8>, <32 x i8>* %a
   %op2 = load volatile <32 x i8>, <32 x i8>* %b
   %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@@ -65,18 +53,38 @@ define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
 }
 
 define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v64i8:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ptrue p1.b
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.b, w9
+; VBITS_GE_256-NEXT:    and z4.b, z4.b, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z4.b, #0
+; VBITS_GE_256-NEXT:    sel z1.b, p1, z1.b, z3.b
+; VBITS_GE_256-NEXT:    sel z0.b, p1, z0.b, z2.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: select_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ptrue p1.b
+; VBITS_GE_512-NEXT:    mov z2.b, w8
+; VBITS_GE_512-NEXT:    and z2.b, z2.b, #0x1
+; VBITS_GE_512-NEXT:    cmpne p1.b, p1/z, z2.b, #0
+; VBITS_GE_512-NEXT:    sel z0.b, p1, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <64 x i8>, <64 x i8>* %a
   %op2 = load volatile <64 x i8>, <64 x i8>* %b
   %sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2
@@ -84,19 +92,20 @@ define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
+define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v128i8:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    and z2.b, z2.b, #0x1
+; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <128 x i8>, <128 x i8>* %a
   %op2 = load volatile <128 x i8>, <128 x i8>* %b
   %sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2
@@ -104,19 +113,20 @@ define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
+define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v256i8:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    and z2.b, z2.b, #0x1
+; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <256 x i8>, <256 x i8>* %a
   %op2 = load volatile <256 x i8>, <256 x i8>* %b
   %sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2
@@ -125,42 +135,45 @@ define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 {
+define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i16:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.4h, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v2.4h, w8
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 {
+define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i16:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.8h, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v2.8h, w8
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
 
-define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
+define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16i16:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z2.h, z2.h, #0x1
+; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <16 x i16>, <16 x i16>* %a
   %op2 = load volatile <16 x i16>, <16 x i16>* %b
   %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@@ -169,18 +182,38 @@ define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
 }
 
 define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v32i16:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ptrue p1.h
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.h, w9
+; VBITS_GE_256-NEXT:    and z4.h, z4.h, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT:    sel z1.h, p1, z1.h, z3.h
+; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z2.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: select_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ptrue p1.h
+; VBITS_GE_512-NEXT:    mov z2.h, w8
+; VBITS_GE_512-NEXT:    and z2.h, z2.h, #0x1
+; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z2.h, #0
+; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <32 x i16>, <32 x i16>* %a
   %op2 = load volatile <32 x i16>, <32 x i16>* %b
   %sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2
@@ -188,19 +221,20 @@ define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
+define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v64i16:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z2.h, z2.h, #0x1
+; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <64 x i16>, <64 x i16>* %a
   %op2 = load volatile <64 x i16>, <64 x i16>* %b
   %sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2
@@ -208,19 +242,20 @@ define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
+define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v128i16:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z2.h, z2.h, #0x1
+; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <128 x i16>, <128 x i16>* %a
   %op2 = load volatile <128 x i16>, <128 x i16>* %b
   %sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2
@@ -229,42 +264,45 @@ define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 {
+define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2i32:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.2s, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v2.2s, w8
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 {
+define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i32:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v2.4s, w8
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
 
-define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
+define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i32:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    and z2.s, z2.s, #0x1
+; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <8 x i32>, <8 x i32>* %a
   %op2 = load volatile <8 x i32>, <8 x i32>* %b
   %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@@ -273,18 +311,38 @@ define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
 }
 
 define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v16i32:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ptrue p1.s
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.s, w9
+; VBITS_GE_256-NEXT:    and z4.s, z4.s, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z4.s, #0
+; VBITS_GE_256-NEXT:    sel z1.s, p1, z1.s, z3.s
+; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z2.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: select_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ptrue p1.s
+; VBITS_GE_512-NEXT:    mov z2.s, w8
+; VBITS_GE_512-NEXT:    and z2.s, z2.s, #0x1
+; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z2.s, #0
+; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <16 x i32>, <16 x i32>* %a
   %op2 = load volatile <16 x i32>, <16 x i32>* %b
   %sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2
@@ -292,19 +350,20 @@ define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
+define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v32i32:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    and z2.s, z2.s, #0x1
+; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <32 x i32>, <32 x i32>* %a
   %op2 = load volatile <32 x i32>, <32 x i32>* %b
   %sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2
@@ -312,19 +371,20 @@ define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
+define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v64i32:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    and z2.s, z2.s, #0x1
+; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <64 x i32>, <64 x i32>* %a
   %op2 = load volatile <64 x i32>, <64 x i32>* %b
   %sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2
@@ -333,42 +393,45 @@ define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 {
+define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v1i64:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm x8, ne
-; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm x8, ne
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 {
+define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2i64:
-; CHECK: tst w0, #0x1
-; CHECK-NEXT: csetm x8, ne
-; CHECK-NEXT: dup v2.2d, x8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    csetm x8, ne
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
   %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
 
-define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
+define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i64:
-; CHECK: and w[[AND:[0-9]+]], w2, #0x1
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
-; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
-; CHECK-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; CHECK-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; CHECK-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z2.d, z2.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <4 x i64>, <4 x i64>* %a
   %op2 = load volatile <4 x i64>, <4 x i64>* %b
   %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
@@ -377,18 +440,38 @@ define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
 }
 
 define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
-; CHECK-LABEL: select_v8i64:
-; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
-; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: select_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ptrue p1.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mov z4.d, x9
+; VBITS_GE_256-NEXT:    and z4.d, z4.d, #0x1
+; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z4.d, #0
+; VBITS_GE_256-NEXT:    sel z1.d, p1, z1.d, z3.d
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: select_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ptrue p1.d
+; VBITS_GE_512-NEXT:    mov z2.d, x8
+; VBITS_GE_512-NEXT:    and z2.d, z2.d, #0x1
+; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <8 x i64>, <8 x i64>* %a
   %op2 = load volatile <8 x i64>, <8 x i64>* %b
   %sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2
@@ -396,19 +479,20 @@ define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
+define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v16i64:
-; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
-; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z2.d, z2.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <16 x i64>, <16 x i64>* %a
   %op2 = load volatile <16 x i64>, <16 x i64>* %b
   %sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2
@@ -416,19 +500,20 @@ define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
   ret void
 }
 
-define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) #0 {
+define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v32i64:
-; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
-; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
-; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
-; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z2.d, z2.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load volatile <32 x i64>, <32 x i64>* %a
   %op2 = load volatile <32 x i64>, <32 x i64>* %b
   %sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
index 4cbc916a59cd..23e37d3c8ad0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
@@ -1,57 +1,45 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; ASHR
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v8i8:
-; CHECK: neg v1.8b, v1.8b
-; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8b, v1.8b
+; CHECK-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = ashr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v16i8:
-; CHECK: neg v1.16b, v1.16b
-; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.16b, v1.16b
+; CHECK-NEXT:    sshl v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = ashr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @ashr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @ashr_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = ashr <32 x i8> %op1, %op2
@@ -60,26 +48,28 @@ define void @ashr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @ashr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: ashr_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    asr z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    asr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ashr_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    asr z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = ashr <64 x i8> %op1, %op2
@@ -87,14 +77,15 @@ define void @ashr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @ashr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @ashr_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ashr_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = ashr <128 x i8> %op1, %op2
@@ -102,14 +93,15 @@ define void @ashr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @ashr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @ashr_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ashr_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = ashr <256 x i8> %op1, %op2
@@ -118,33 +110,36 @@ define void @ashr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v4i16:
-; CHECK: neg v1.4h, v1.4h
-; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.4h, v1.4h
+; CHECK-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = ashr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v8i16:
-; CHECK: neg v1.8h, v1.8h
-; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8h, v1.8h
+; CHECK-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = ashr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @ashr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @ashr_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = ashr <16 x i16> %op1, %op2
@@ -153,26 +148,28 @@ define void @ashr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @ashr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: ashr_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    asr z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    asr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ashr_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    asr z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = ashr <32 x i16> %op1, %op2
@@ -180,14 +177,15 @@ define void @ashr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @ashr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @ashr_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ashr_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = ashr <64 x i16> %op1, %op2
@@ -195,14 +193,15 @@ define void @ashr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @ashr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @ashr_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ashr_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = ashr <128 x i16> %op1, %op2
@@ -211,33 +210,36 @@ define void @ashr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v2i32:
-; CHECK: neg v1.2s, v1.2s
-; CHECK-NEXT: sshl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.2s, v1.2s
+; CHECK-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = ashr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v4i32:
-; CHECK: neg v1.4s, v1.4s
-; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.4s, v1.4s
+; CHECK-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = ashr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @ashr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @ashr_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = ashr <8 x i32> %op1, %op2
@@ -246,26 +248,28 @@ define void @ashr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @ashr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: ashr_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    asr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    asr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ashr_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    asr z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = ashr <16 x i32> %op1, %op2
@@ -273,14 +277,15 @@ define void @ashr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @ashr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @ashr_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ashr_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = ashr <32 x i32> %op1, %op2
@@ -288,14 +293,15 @@ define void @ashr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @ashr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @ashr_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ashr_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = ashr <64 x i32> %op1, %op2
@@ -304,33 +310,36 @@ define void @ashr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v1i64:
-; CHECK: neg d1, d1
-; CHECK-NEXT: sshl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg d1, d1
+; CHECK-NEXT:    sshl d0, d0, d1
+; CHECK-NEXT:    ret
   %res = ashr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v2i64:
-; CHECK: neg v1.2d, v1.2d
-; CHECK-NEXT: sshl v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.2d, v1.2d
+; CHECK-NEXT:    sshl v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = ashr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @ashr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @ashr_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ashr_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = ashr <4 x i64> %op1, %op2
@@ -339,26 +348,28 @@ define void @ashr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @ashr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: ashr_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: asr [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: asr [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: ashr_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    asr z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    asr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: ashr_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    asr z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = ashr <8 x i64> %op1, %op2
@@ -366,14 +377,15 @@ define void @ashr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @ashr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @ashr_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: ashr_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = ashr <16 x i64> %op1, %op2
@@ -381,14 +393,15 @@ define void @ashr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @ashr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @ashr_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: ashr_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: asr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = ashr <32 x i64> %op1, %op2
@@ -401,33 +414,36 @@ define void @ashr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v8i8:
-; CHECK: neg v1.8b, v1.8b
-; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8b, v1.8b
+; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = lshr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v16i8:
-; CHECK: neg v1.16b, v1.16b
-; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.16b, v1.16b
+; CHECK-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = lshr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @lshr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @lshr_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = lshr <32 x i8> %op1, %op2
@@ -436,26 +452,28 @@ define void @lshr_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @lshr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: lshr_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsr z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: lshr_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = lshr <64 x i8> %op1, %op2
@@ -463,14 +481,15 @@ define void @lshr_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @lshr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @lshr_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: lshr_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = lshr <128 x i8> %op1, %op2
@@ -478,14 +497,15 @@ define void @lshr_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @lshr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @lshr_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: lshr_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = lshr <256 x i8> %op1, %op2
@@ -494,33 +514,36 @@ define void @lshr_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v4i16:
-; CHECK: neg v1.4h, v1.4h
-; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.4h, v1.4h
+; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = lshr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v8i16:
-; CHECK: neg v1.8h, v1.8h
-; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8h, v1.8h
+; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = lshr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @lshr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @lshr_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = lshr <16 x i16> %op1, %op2
@@ -529,26 +552,28 @@ define void @lshr_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @lshr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: lshr_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsr z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: lshr_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = lshr <32 x i16> %op1, %op2
@@ -556,14 +581,15 @@ define void @lshr_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @lshr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @lshr_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: lshr_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = lshr <64 x i16> %op1, %op2
@@ -571,14 +597,15 @@ define void @lshr_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @lshr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @lshr_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: lshr_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = lshr <128 x i16> %op1, %op2
@@ -587,33 +614,36 @@ define void @lshr_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v2i32:
-; CHECK: neg v1.2s, v1.2s
-; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.2s, v1.2s
+; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = lshr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v4i32:
-; CHECK: neg v1.4s, v1.4s
-; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.4s, v1.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = lshr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @lshr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @lshr_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = lshr <8 x i32> %op1, %op2
@@ -622,26 +652,28 @@ define void @lshr_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @lshr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: lshr_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: lshr_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = lshr <16 x i32> %op1, %op2
@@ -649,14 +681,15 @@ define void @lshr_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @lshr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @lshr_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: lshr_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = lshr <32 x i32> %op1, %op2
@@ -664,14 +697,15 @@ define void @lshr_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @lshr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @lshr_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: lshr_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = lshr <64 x i32> %op1, %op2
@@ -680,33 +714,36 @@ define void @lshr_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v1i64:
-; CHECK: neg d1, d1
-; CHECK-NEXT: ushl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg d1, d1
+; CHECK-NEXT:    ushl d0, d0, d1
+; CHECK-NEXT:    ret
   %res = lshr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v2i64:
-; CHECK: neg v1.2d, v1.2d
-; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.2d, v1.2d
+; CHECK-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = lshr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @lshr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @lshr_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: lshr_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = lshr <4 x i64> %op1, %op2
@@ -715,26 +752,28 @@ define void @lshr_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @lshr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: lshr_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: lsr [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: lsr [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: lshr_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: lshr_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = lshr <8 x i64> %op1, %op2
@@ -742,14 +781,15 @@ define void @lshr_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @lshr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @lshr_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: lshr_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = lshr <16 x i64> %op1, %op2
@@ -757,14 +797,15 @@ define void @lshr_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @lshr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @lshr_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: lshr_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsr [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = lshr <32 x i64> %op1, %op2
@@ -777,31 +818,34 @@ define void @lshr_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v8i8:
-; CHECK: ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %res = shl <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v16i8:
-; CHECK: ushl v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %res = shl <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
 
-define void @shl_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @shl_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
   %res = shl <32 x i8> %op1, %op2
@@ -810,26 +854,28 @@ define void @shl_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @shl_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; CHECK-LABEL: shl_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsl z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: shl_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %res = shl <64 x i8> %op1, %op2
@@ -837,14 +883,15 @@ define void @shl_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @shl_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
+define void @shl_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: shl_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %res = shl <128 x i8> %op1, %op2
@@ -852,14 +899,15 @@ define void @shl_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @shl_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+define void @shl_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: shl_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %res = shl <256 x i8> %op1, %op2
@@ -868,31 +916,34 @@ define void @shl_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v4i16:
-; CHECK: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %res = shl <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v8i16:
-; CHECK: ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %res = shl <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
 
-define void @shl_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @shl_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = shl <16 x i16> %op1, %op2
@@ -901,26 +952,28 @@ define void @shl_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @shl_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; CHECK-LABEL: shl_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsl z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: shl_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
   %res = shl <32 x i16> %op1, %op2
@@ -928,14 +981,15 @@ define void @shl_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @shl_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+define void @shl_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: shl_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %res = shl <64 x i16> %op1, %op2
@@ -943,14 +997,15 @@ define void @shl_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @shl_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+define void @shl_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: shl_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %res = shl <128 x i16> %op1, %op2
@@ -959,31 +1014,34 @@ define void @shl_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v2i32:
-; CHECK: ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %res = shl <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v4i32:
-; CHECK: ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %res = shl <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
 
-define void @shl_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @shl_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = shl <8 x i32> %op1, %op2
@@ -992,26 +1050,28 @@ define void @shl_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @shl_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; CHECK-LABEL: shl_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsl z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: shl_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
   %res = shl <16 x i32> %op1, %op2
@@ -1019,14 +1079,15 @@ define void @shl_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @shl_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+define void @shl_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: shl_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %res = shl <32 x i32> %op1, %op2
@@ -1034,14 +1095,15 @@ define void @shl_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @shl_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+define void @shl_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: shl_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %res = shl <64 x i32> %op1, %op2
@@ -1050,31 +1112,34 @@ define void @shl_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v1i64:
-; CHECK: ushl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl d0, d0, d1
+; CHECK-NEXT:    ret
   %res = shl <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v2i64:
-; CHECK: ushl v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
   %res = shl <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
 
-define void @shl_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @shl_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shl_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; CHECK-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
   %res = shl <4 x i64> %op1, %op2
@@ -1083,26 +1148,28 @@ define void @shl_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @shl_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; CHECK-LABEL: shl_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_512-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: lsl [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
-; VBITS_EQ_256-DAG: lsl [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shl_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    lsl z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: shl_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
   %res = shl <8 x i64> %op1, %op2
@@ -1110,14 +1177,15 @@ define void @shl_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @shl_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+define void @shl_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: shl_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_1024-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %res = shl <16 x i64> %op1, %op2
@@ -1125,14 +1193,15 @@ define void @shl_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @shl_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+define void @shl_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: shl_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
-; VBITS_GE_2048-NEXT: lsl [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %res = shl <32 x i64> %op1, %op2

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 7de5eefd74db..a0c4b4313917 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
+define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i16_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ucvtf v0.4h, v0.4h
@@ -34,7 +20,7 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
+define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v8i16_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -47,7 +33,7 @@ define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 {
+define void @ucvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v16i16_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -87,84 +73,28 @@ define void @ucvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i16_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT:    ucvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v64i16_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v64i16_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %res = uitofp <64 x i16> %op1 to <64 x half>
   store <64 x half> %res, <64 x half>* %b
   ret void
 }
 
-define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v128i16_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #96
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #80
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #112
-; VBITS_GE_256-NEXT:    mov x14, #64
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT:    ucvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT:    ucvtf z5.h, p0/m, z5.h
-; VBITS_GE_256-NEXT:    ucvtf z4.h, p0/m, z4.h
-; VBITS_GE_256-NEXT:    ucvtf z6.h, p0/m, z6.h
-; VBITS_GE_256-NEXT:    ucvtf z7.h, p0/m, z7.h
-; VBITS_GE_256-NEXT:    st1h { z6.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v128i16_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ucvtf z0.h, p0/m, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v128i16_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %res = uitofp <128 x i16> %op1 to <128 x half>
   store <128 x half> %res, <128 x half>* %b
@@ -176,7 +106,7 @@ define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
+define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i16_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
@@ -188,7 +118,7 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
+define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i16_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
@@ -198,7 +128,7 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
   ret <4 x float> %res
 }
 
-define void @ucvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 {
+define void @ucvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v8i16_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -244,102 +174,32 @@ define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v32i16_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v32i16_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %res = uitofp <32 x i16> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
   ret void
 }
 
-define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i16_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #32
-; VBITS_GE_256-NEXT:    mov x10, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT:    ucvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT:    ucvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT:    ucvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT:    ucvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v64i16_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v64i16_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %res = uitofp <64 x i16> %op1 to <64 x float>
   store <64 x float> %res, <64 x float>* %b
@@ -351,7 +211,7 @@ define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
 ;
 
 ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
-define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
+define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i16_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -366,7 +226,7 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
+define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i16_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
@@ -378,7 +238,7 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
   ret <2 x double> %res
 }
 
-define void @ucvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 {
+define void @ucvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i16_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -426,119 +286,34 @@ define void @ucvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov z1.d, z0.d
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i16_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i16_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    uunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    uunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    uunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    uunpklo z6.s, z6.h
-; VBITS_GE_256-NEXT:    movprfx z0, z5
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z5.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z6.s
-; VBITS_GE_256-NEXT:    ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    uunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    movprfx z0, z4
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z4.d
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i16_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i16_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %res = uitofp <32 x i16> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -550,7 +325,7 @@ define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
+define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i32_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -562,7 +337,7 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
+define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i32_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ucvtf v0.4s, v0.4s
@@ -572,7 +347,7 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
   ret <4 x half> %res
 }
 
-define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) #0 {
+define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v8i32_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -621,110 +396,34 @@ define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    ucvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    ucvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    splice z0.h, p0, z0.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s
-; VBITS_GE_1024-NEXT:    ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v32i32_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = uitofp <32 x i32> %op1 to <32 x half>
   store <32 x half> %res, <32 x half>* %b
   ret void
 }
 
-define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i32_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x12, #48
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #40
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    ptrue p2.h, vl8
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    splice z2.h, p2, z2.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z6
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p1/m, z6.s
-; VBITS_GE_256-NEXT:    ucvtf z5.h, p1/m, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT:    ucvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT:    ucvtf z4.h, p1/m, z4.s
-; VBITS_GE_256-NEXT:    splice z5.h, p2, z5.h, z1.h
-; VBITS_GE_256-NEXT:    ucvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p1/m, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    splice z4.h, p2, z4.h, z3.h
-; VBITS_GE_256-NEXT:    splice z1.h, p2, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s
-; VBITS_GE_2048-NEXT:    ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v64i32_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %res = uitofp <64 x i32> %op1 to <64 x half>
   store <64 x half> %res, <64 x half>* %b
@@ -736,7 +435,7 @@ define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
+define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i32_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ucvtf v0.2s, v0.2s
@@ -746,7 +445,7 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
+define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i32_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ucvtf v0.4s, v0.4s
@@ -755,7 +454,7 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
   ret <4 x float> %res
 }
 
-define void @ucvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 {
+define void @ucvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v8i32_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -795,84 +494,28 @@ define void @ucvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v32i32_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = uitofp <32 x i32> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
   ret void
 }
 
-define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v64i32_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #56
-; VBITS_GE_256-NEXT:    mov x14, #32
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT:    ucvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT:    ucvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT:    ucvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v64i32_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %res = uitofp <64 x i32> %op1 to <64 x float>
   store <64 x float> %res, <64 x float>* %b
@@ -884,7 +527,7 @@ define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
+define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i32_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
@@ -896,7 +539,7 @@ define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
+define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i32_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
@@ -906,7 +549,7 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
   ret <2 x double> %res
 }
 
-define void @ucvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 {
+define void @ucvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i32_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -952,102 +595,32 @@ define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z1.s
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i32_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i32_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %res = uitofp <16 x i32> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x11, #12
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT:    uunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT:    ucvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT:    ucvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    mov x10, #20
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT:    ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i32_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i32_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = uitofp <32 x i32> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -1059,7 +632,7 @@ define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
+define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -1071,7 +644,7 @@ define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
+define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -1085,7 +658,7 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
   ret <2 x half> %res
 }
 
-define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) #0 {
+define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i64_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1134,126 +707,37 @@ define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 {
   ret <8 x half> %res
 }
 
-define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    mov v2.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
-; VBITS_GE_1024-NEXT:    ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i64_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = uitofp <16 x i64> %op1 to <16 x half>
   store <16 x half> %res, <16 x half>* %b
   ret void
 }
 
-define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    mov x11, #28
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x13, #20
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    mov v1.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    movprfx z2, z6
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z6.d
-; VBITS_GE_256-NEXT:    ucvtf z5.h, p0/m, z5.d
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT:    ucvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT:    mov v5.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    movprfx z2, z4
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z4.d
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT:    movprfx z2, z7
-; VBITS_GE_256-NEXT:    ucvtf z2.h, p0/m, z7.d
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    mov v2.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    splice z5.h, p0, z5.h, z1.h
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z3.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
-; VBITS_GE_2048-NEXT:    ucvtf z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
-  %op1 = load <32 x i64>, <32 x i64>* %a
+define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i64_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <32 x i64>, <32 x i64>* %a
   %res = uitofp <32 x i64> %op1 to <32 x half>
   store <32 x half> %res, <32 x half>* %b
   ret void
@@ -1264,7 +748,7 @@ define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
+define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -1276,7 +760,7 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
+define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ucvtf v0.2d, v0.2d
@@ -1286,7 +770,7 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
   ret <2 x float> %res
 }
 
-define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) #0 {
+define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i64_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1335,110 +819,34 @@ define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_256-NEXT:    ucvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    ucvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    splice z2.s, p0, z2.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z0.s, p0, z0.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
-; VBITS_GE_1024-NEXT:    ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i64_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = uitofp <16 x i64> %op1 to <16 x float>
   store <16 x float> %res, <16 x float>* %b
   ret void
 }
 
-define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x11, #8
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #20
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT:    ucvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    ptrue p2.s, vl4
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    splice z2.s, p2, z2.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z6
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p1/m, z6.d
-; VBITS_GE_256-NEXT:    ucvtf z5.s, p1/m, z5.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    ucvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT:    ucvtf z4.s, p1/m, z4.d
-; VBITS_GE_256-NEXT:    splice z5.s, p2, z5.s, z1.s
-; VBITS_GE_256-NEXT:    ucvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p1/m, z7.d
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    splice z4.s, p2, z4.s, z3.s
-; VBITS_GE_256-NEXT:    splice z1.s, p2, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
-; VBITS_GE_2048-NEXT:    ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i64_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = uitofp <32 x i64> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
@@ -1450,7 +858,7 @@ define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
+define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -1462,7 +870,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
+define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i64_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ucvtf v0.2d, v0.2d
@@ -1471,7 +879,7 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
   ret <2 x double> %res
 }
 
-define void @ucvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 {
+define void @ucvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i64_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1511,84 +919,28 @@ define void @ucvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @ucvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @ucvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: ucvtf_v16i64_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = uitofp <16 x i64> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    ucvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    ucvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ucvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT:    ucvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT:    ucvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT:    ucvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: ucvtf_v32i64_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = uitofp <32 x i64> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -1600,7 +952,7 @@ define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
+define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i16_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    scvtf v0.4h, v0.4h
@@ -1610,7 +962,7 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
+define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v8i16_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -1623,7 +975,7 @@ define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 {
+define void @scvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v16i16_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -1663,84 +1015,28 @@ define void @scvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i16_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    scvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT:    scvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v64i16_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    scvtf z0.h, p0/m, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v64i16_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %res = sitofp <64 x i16> %op1 to <64 x half>
   store <64 x half> %res, <64 x half>* %b
   ret void
 }
 
-define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v128i16_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #96
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #80
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #112
-; VBITS_GE_256-NEXT:    mov x14, #64
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    scvtf z1.h, p0/m, z1.h
-; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.h
-; VBITS_GE_256-NEXT:    scvtf z3.h, p0/m, z3.h
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z2.h
-; VBITS_GE_256-NEXT:    scvtf z5.h, p0/m, z5.h
-; VBITS_GE_256-NEXT:    scvtf z4.h, p0/m, z4.h
-; VBITS_GE_256-NEXT:    scvtf z6.h, p0/m, z6.h
-; VBITS_GE_256-NEXT:    scvtf z7.h, p0/m, z7.h
-; VBITS_GE_256-NEXT:    st1h { z6.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v128i16_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    scvtf z0.h, p0/m, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v128i16_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %res = sitofp <128 x i16> %op1 to <128 x half>
   store <128 x half> %res, <128 x half>* %b
@@ -1752,7 +1048,7 @@ define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
+define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i16_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #16
@@ -1764,7 +1060,7 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
+define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i16_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
@@ -1774,7 +1070,7 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
   ret <4 x float> %res
 }
 
-define void @scvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 {
+define void @scvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v8i16_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -1820,102 +1116,32 @@ define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i16_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v32i16_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v32i16_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %res = sitofp <32 x i16> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
   ret void
 }
 
-define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i16_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #32
-; VBITS_GE_256-NEXT:    mov x10, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x11, #24
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT:    scvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT:    scvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT:    scvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    scvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT:    scvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v64i16_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v64i16_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %res = sitofp <64 x i16> %op1 to <64 x float>
   store <64 x float> %res, <64 x float>* %b
@@ -1927,7 +1153,7 @@ define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 {
 ;
 
 ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
-define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
+define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i16_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -1942,7 +1168,7 @@ define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
+define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i16_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #16
@@ -1954,7 +1180,7 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
   ret <2 x double> %res
 }
 
-define void @scvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 {
+define void @scvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i16_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -2002,119 +1228,34 @@ define void @scvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i16_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov z1.d, z0.d
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i16_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i16_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i16_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z6.h
-; VBITS_GE_256-NEXT:    movprfx z0, z5
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z5.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z6.s
-; VBITS_GE_256-NEXT:    scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    movprfx z0, z4
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z4.d
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i16_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i16_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %res = sitofp <32 x i16> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -2126,7 +1267,7 @@ define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
+define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i32_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -2138,7 +1279,7 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
+define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i32_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    scvtf v0.4s, v0.4s
@@ -2148,7 +1289,7 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
   ret <4 x half> %res
 }
 
-define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) #0 {
+define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v8i32_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -2197,110 +1338,34 @@ define void @scvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i32_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    scvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT:    scvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT:    scvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    scvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    splice z0.h, p0, z0.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s
-; VBITS_GE_1024-NEXT:    scvtf z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v32i32_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = sitofp <32 x i32> %op1 to <32 x half>
   store <32 x half> %res, <32 x half>* %b
   ret void
 }
 
-define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i32_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x12, #48
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #40
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    scvtf z1.h, p1/m, z1.s
-; VBITS_GE_256-NEXT:    scvtf z2.h, p1/m, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    ptrue p2.h, vl8
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    splice z2.h, p2, z2.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z6
-; VBITS_GE_256-NEXT:    scvtf z1.h, p1/m, z6.s
-; VBITS_GE_256-NEXT:    scvtf z5.h, p1/m, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT:    scvtf z3.h, p1/m, z3.s
-; VBITS_GE_256-NEXT:    scvtf z4.h, p1/m, z4.s
-; VBITS_GE_256-NEXT:    splice z5.h, p2, z5.h, z1.h
-; VBITS_GE_256-NEXT:    scvtf z0.h, p1/m, z0.s
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    scvtf z1.h, p1/m, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    splice z4.h, p2, z4.h, z3.h
-; VBITS_GE_256-NEXT:    splice z1.h, p2, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s
-; VBITS_GE_2048-NEXT:    scvtf z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v64i32_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %res = sitofp <64 x i32> %op1 to <64 x half>
   store <64 x half> %res, <64 x half>* %b
@@ -2312,7 +1377,7 @@ define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
+define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i32_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    scvtf v0.2s, v0.2s
@@ -2322,7 +1387,7 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
+define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i32_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    scvtf v0.4s, v0.4s
@@ -2331,7 +1396,7 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
   ret <4 x float> %res
 }
 
-define void @scvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 {
+define void @scvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v8i32_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -2371,84 +1436,28 @@ define void @scvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i32_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    scvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v32i32_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = sitofp <32 x i32> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
   ret void
 }
 
-define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v64i32_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #56
-; VBITS_GE_256-NEXT:    mov x14, #32
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.s
-; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    scvtf z3.s, p0/m, z3.s
-; VBITS_GE_256-NEXT:    scvtf z2.s, p0/m, z2.s
-; VBITS_GE_256-NEXT:    scvtf z5.s, p0/m, z5.s
-; VBITS_GE_256-NEXT:    scvtf z4.s, p0/m, z4.s
-; VBITS_GE_256-NEXT:    scvtf z6.s, p0/m, z6.s
-; VBITS_GE_256-NEXT:    scvtf z7.s, p0/m, z7.s
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v64i32_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %res = sitofp <64 x i32> %op1 to <64 x float>
   store <64 x float> %res, <64 x float>* %b
@@ -2460,7 +1469,7 @@ define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
+define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i32_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
@@ -2472,7 +1481,7 @@ define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
+define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i32_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
@@ -2482,7 +1491,7 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
   ret <2 x double> %res
 }
 
-define void @scvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 {
+define void @scvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i32_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -2528,102 +1537,32 @@ define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i32_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z1.s
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i32_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i32_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i32_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x11, #12
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT:    scvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT:    scvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT:    scvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    mov x10, #20
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT:    scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i32_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i32_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = sitofp <32 x i32> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b
@@ -2635,7 +1574,7 @@ define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
+define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -2647,7 +1586,7 @@ define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) #0 {
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
+define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -2661,7 +1600,7 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
   ret <2 x half> %res
 }
 
-define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) #0 {
+define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i64_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -2710,125 +1649,36 @@ define <8 x half> @scvtf_v8i64_v8f16(<8 x i64>* %a) #0 {
   ret <8 x half> %res
 }
 
-define void @scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i64_v16f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d
-; VBITS_GE_256-NEXT:    scvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    mov v2.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
-; VBITS_GE_1024-NEXT:    scvtf z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i64_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = sitofp <16 x i64> %op1 to <16 x half>
   store <16 x half> %res, <16 x half>* %b
   ret void
 }
 
-define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i64_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    mov x11, #28
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x13, #20
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z1.h, p0/m, z1.d
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    mov v1.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    movprfx z2, z6
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z6.d
-; VBITS_GE_256-NEXT:    scvtf z5.h, p0/m, z5.d
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT:    scvtf z3.h, p0/m, z3.d
-; VBITS_GE_256-NEXT:    mov v5.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    movprfx z2, z4
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z4.d
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT:    movprfx z2, z7
-; VBITS_GE_256-NEXT:    scvtf z2.h, p0/m, z7.d
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    mov v2.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    splice z5.h, p0, z5.h, z1.h
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z3.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
-; VBITS_GE_2048-NEXT:    scvtf z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i64_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = sitofp <32 x i64> %op1 to <32 x half>
   store <32 x half> %res, <32 x half>* %b
@@ -2840,7 +1690,7 @@ define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
+define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -2852,7 +1702,7 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
+define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    scvtf v0.2d, v0.2d
@@ -2862,7 +1712,7 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
   ret <2 x float> %res
 }
 
-define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) #0 {
+define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i64_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -2911,110 +1761,34 @@ define void @scvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i64_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_256-NEXT:    scvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    scvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    splice z2.s, p0, z2.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z0.s, p0, z0.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
-; VBITS_GE_1024-NEXT:    scvtf z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i64_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = sitofp <16 x i64> %op1 to <16 x float>
   store <16 x float> %res, <16 x float>* %b
   ret void
 }
 
-define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i64_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x11, #8
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #20
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    scvtf z1.s, p1/m, z1.d
-; VBITS_GE_256-NEXT:    scvtf z2.s, p1/m, z2.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    ptrue p2.s, vl4
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    splice z2.s, p2, z2.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z6
-; VBITS_GE_256-NEXT:    scvtf z1.s, p1/m, z6.d
-; VBITS_GE_256-NEXT:    scvtf z5.s, p1/m, z5.d
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    scvtf z3.s, p1/m, z3.d
-; VBITS_GE_256-NEXT:    scvtf z4.s, p1/m, z4.d
-; VBITS_GE_256-NEXT:    splice z5.s, p2, z5.s, z1.s
-; VBITS_GE_256-NEXT:    scvtf z0.s, p1/m, z0.d
-; VBITS_GE_256-NEXT:    movprfx z1, z7
-; VBITS_GE_256-NEXT:    scvtf z1.s, p1/m, z7.d
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    splice z4.s, p2, z4.s, z3.s
-; VBITS_GE_256-NEXT:    splice z1.s, p2, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
-; VBITS_GE_2048-NEXT:    scvtf z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i64_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = sitofp <32 x i64> %op1 to <32 x float>
   store <32 x float> %res, <32 x float>* %b
@@ -3026,7 +1800,7 @@ define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
+define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -3038,7 +1812,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
+define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i64_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    scvtf v0.2d, v0.2d
@@ -3047,7 +1821,7 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
   ret <2 x double> %res
 }
 
-define void @scvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 {
+define void @scvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i64_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -3087,84 +1861,28 @@ define void @scvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @scvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v16i64_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @scvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: scvtf_v16i64_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = sitofp <16 x i64> %op1 to <16 x double>
   store <16 x double> %res, <16 x double>* %b
   ret void
 }
 
-define void @scvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: scvtf_v32i64_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    scvtf z3.d, p0/m, z3.d
-; VBITS_GE_256-NEXT:    scvtf z2.d, p0/m, z2.d
-; VBITS_GE_256-NEXT:    scvtf z5.d, p0/m, z5.d
-; VBITS_GE_256-NEXT:    scvtf z4.d, p0/m, z4.d
-; VBITS_GE_256-NEXT:    scvtf z6.d, p0/m, z6.d
-; VBITS_GE_256-NEXT:    scvtf z7.d, p0/m, z7.d
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @scvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: scvtf_v32i64_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = sitofp <32 x i64> %op1 to <32 x double>
   store <32 x double> %res, <32 x double>* %b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index 86e5092b71af..a5b725a8fa1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -1,26 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 {
+define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v2.8b, v2.8b, #7
@@ -32,7 +18,7 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v2.16b, v2.16b, #7
@@ -43,1116 +29,96 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
   ret <16 x i8> %sel
 }
 
-define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, <32 x i1>* %c) #0 {
+define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    ldr w8, [x2]
 ; CHECK-NEXT:    ptrue p0.b, vl32
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    asr w9, w8, #31
-; CHECK-NEXT:    sbfx w10, w8, #30, #1
-; CHECK-NEXT:    sbfx w11, w8, #29, #1
-; CHECK-NEXT:    strb w9, [sp, #31]
-; CHECK-NEXT:    sbfx w9, w8, #28, #1
-; CHECK-NEXT:    strb w10, [sp, #30]
-; CHECK-NEXT:    sbfx w10, w8, #27, #1
-; CHECK-NEXT:    strb w11, [sp, #29]
-; CHECK-NEXT:    sbfx w11, w8, #26, #1
-; CHECK-NEXT:    strb w9, [sp, #28]
-; CHECK-NEXT:    sbfx w9, w8, #25, #1
-; CHECK-NEXT:    strb w10, [sp, #27]
-; CHECK-NEXT:    sbfx w10, w8, #24, #1
-; CHECK-NEXT:    strb w11, [sp, #26]
-; CHECK-NEXT:    sbfx w11, w8, #23, #1
-; CHECK-NEXT:    strb w9, [sp, #25]
-; CHECK-NEXT:    sbfx w9, w8, #22, #1
-; CHECK-NEXT:    strb w10, [sp, #24]
-; CHECK-NEXT:    sbfx w10, w8, #21, #1
-; CHECK-NEXT:    strb w11, [sp, #23]
-; CHECK-NEXT:    sbfx w11, w8, #20, #1
-; CHECK-NEXT:    strb w9, [sp, #22]
-; CHECK-NEXT:    sbfx w9, w8, #19, #1
-; CHECK-NEXT:    strb w10, [sp, #21]
-; CHECK-NEXT:    sbfx w10, w8, #18, #1
-; CHECK-NEXT:    strb w11, [sp, #20]
-; CHECK-NEXT:    sbfx w11, w8, #17, #1
-; CHECK-NEXT:    strb w9, [sp, #19]
-; CHECK-NEXT:    sbfx w9, w8, #16, #1
-; CHECK-NEXT:    strb w10, [sp, #18]
-; CHECK-NEXT:    sbfx w10, w8, #15, #1
-; CHECK-NEXT:    strb w11, [sp, #17]
-; CHECK-NEXT:    sbfx w11, w8, #14, #1
-; CHECK-NEXT:    strb w9, [sp, #16]
-; CHECK-NEXT:    sbfx w9, w8, #13, #1
-; CHECK-NEXT:    strb w10, [sp, #15]
-; CHECK-NEXT:    sbfx w10, w8, #12, #1
-; CHECK-NEXT:    strb w11, [sp, #14]
-; CHECK-NEXT:    sbfx w11, w8, #11, #1
-; CHECK-NEXT:    strb w9, [sp, #13]
-; CHECK-NEXT:    sbfx w9, w8, #10, #1
-; CHECK-NEXT:    strb w10, [sp, #12]
-; CHECK-NEXT:    sbfx w10, w8, #9, #1
-; CHECK-NEXT:    strb w11, [sp, #11]
-; CHECK-NEXT:    sbfx w11, w8, #8, #1
-; CHECK-NEXT:    strb w9, [sp, #10]
-; CHECK-NEXT:    sbfx w9, w8, #7, #1
-; CHECK-NEXT:    strb w10, [sp, #9]
-; CHECK-NEXT:    sbfx w10, w8, #6, #1
-; CHECK-NEXT:    strb w11, [sp, #8]
-; CHECK-NEXT:    sbfx w11, w8, #5, #1
-; CHECK-NEXT:    strb w9, [sp, #7]
-; CHECK-NEXT:    sbfx w9, w8, #4, #1
-; CHECK-NEXT:    strb w10, [sp, #6]
-; CHECK-NEXT:    sbfx w10, w8, #3, #1
-; CHECK-NEXT:    strb w11, [sp, #5]
-; CHECK-NEXT:    sbfx w11, w8, #2, #1
-; CHECK-NEXT:    strb w9, [sp, #4]
-; CHECK-NEXT:    sbfx w9, w8, #1, #1
-; CHECK-NEXT:    sbfx w8, w8, #0, #1
-; CHECK-NEXT:    strb w10, [sp, #3]
-; CHECK-NEXT:    strb w11, [sp, #2]
-; CHECK-NEXT:    strb w9, [sp, #1]
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x1]
-; CHECK-NEXT:    and z0.b, z0.b, #0x1
-; CHECK-NEXT:    cmpne p1.b, p1/z, z0.b, #0
-; CHECK-NEXT:    sel z0.b, p1, z1.b, z2.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    ret
-  %mask = load <32 x i1>, <32 x i1>* %c
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
+  %mask = icmp eq <32 x i8> %op1, %op2
   %sel = select <32 x i1> %mask, <32 x i8> %op1, <32 x i8> %op2
   store <32 x i8> %sel, <32 x i8>* %a
   ret void
 }
 
-define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, <64 x i1>* %c) #0 {
+define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z2.b
+; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z1.b, z3.b
+; VBITS_GE_256-NEXT:    sel z0.b, p1, z0.b, z2.b
+; VBITS_GE_256-NEXT:    sel z1.b, p2, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: select_v64i8:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT:    mov x29, sp
-; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    sub x9, sp, #112
-; VBITS_GE_512-NEXT:    and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT:    ldr x8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_512-NEXT:    ptrue p1.b
-; VBITS_GE_512-NEXT:    asr x9, x8, #63
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #63]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #62]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #61]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #60]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #59]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #58]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #57]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #55]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #54]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #53]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #52]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #51]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #50]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #49]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #47]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #46]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #45]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #44]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #43]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #42]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #41]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #39]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #38]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #37]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #36]
-; VBITS_GE_512-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #35]
-; VBITS_GE_512-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #34]
-; VBITS_GE_512-NEXT:    asr w11, w8, #31
-; VBITS_GE_512-NEXT:    strb w9, [sp, #33]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #31]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #30]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #29]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #28]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #27]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #26]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #25]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #23]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #22]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #21]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #20]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #19]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #18]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #17]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #15]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #14]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #13]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #12]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #11]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #10]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #9]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #8]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #7]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #6]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #5]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #4]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT:    strb w9, [sp, #3]
-; VBITS_GE_512-NEXT:    strb w10, [sp, #2]
-; VBITS_GE_512-NEXT:    strb w11, [sp, #1]
-; VBITS_GE_512-NEXT:    strb w8, [sp]
-; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    and z0.b, z0.b, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.b, p1/z, z0.b, #0
-; VBITS_GE_512-NEXT:    sel z0.b, p1, z1.b, z2.b
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; VBITS_GE_512-NEXT:    sel z0.b, p1, z0.b, z1.b
 ; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_512-NEXT:    mov sp, x29
-; VBITS_GE_512-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT:    .cfi_restore w30
-; VBITS_GE_512-NEXT:    .cfi_restore w29
 ; VBITS_GE_512-NEXT:    ret
-  %mask = load <64 x i1>, <64 x i1>* %c
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
+  %mask = icmp eq <64 x i8> %op1, %op2
   %sel = select <64 x i1> %mask, <64 x i8> %op1, <64 x i8> %op2
   store <64 x i8> %sel, <64 x i8>* %a
   ret void
 }
 
-define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, <128 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT:    mov x29, sp
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    sub x9, sp, #240
-; VBITS_GE_1024-NEXT:    and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT:    ldr x8, [x2, #8]
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    ptrue p1.b
-; VBITS_GE_1024-NEXT:    asr x9, x8, #63
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #127]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #126]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #125]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #124]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #123]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #122]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #121]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #119]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #118]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #117]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #116]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #115]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #114]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #113]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #111]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #110]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #109]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #108]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #107]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #106]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #105]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #103]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #102]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #101]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #100]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #99]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #98]
-; VBITS_GE_1024-NEXT:    asr w11, w8, #31
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #97]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #95]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #94]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #93]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #92]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #91]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #90]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #89]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #87]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #86]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #85]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #84]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #83]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #82]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #81]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #79]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #78]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #77]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #76]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #75]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #74]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #73]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #71]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #70]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #69]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #68]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #67]
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #66]
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #65]
-; VBITS_GE_1024-NEXT:    strb w8, [sp, #64]
-; VBITS_GE_1024-NEXT:    ldr x8, [x2]
-; VBITS_GE_1024-NEXT:    asr x9, x8, #63
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #63]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #62]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #61]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #60]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #59]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #58]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #57]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #55]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #54]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #53]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #52]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #51]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #50]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #49]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #47]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #46]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #45]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #44]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #43]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #42]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #41]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #39]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #38]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #37]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #36]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #35]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #34]
-; VBITS_GE_1024-NEXT:    asr w11, w8, #31
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #33]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #31]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #30]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #29]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #28]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #27]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #26]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #25]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #23]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #22]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #21]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #20]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #19]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #18]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #17]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #15]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #14]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #13]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #12]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #11]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #10]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #9]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #8]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #7]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #6]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #5]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #4]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #3]
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #2]
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #1]
-; VBITS_GE_1024-NEXT:    strb w8, [sp]
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    and z0.b, z0.b, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.b, p1/z, z0.b, #0
-; VBITS_GE_1024-NEXT:    sel z0.b, p1, z1.b, z2.b
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    mov sp, x29
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT:    .cfi_restore w30
-; VBITS_GE_1024-NEXT:    .cfi_restore w29
-; VBITS_GE_1024-NEXT:    ret
-  %mask = load <128 x i1>, <128 x i1>* %c
+define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
+  %mask = icmp eq <128 x i8> %op1, %op2
   %sel = select <128 x i1> %mask, <128 x i8> %op1, <128 x i8> %op2
   store <128 x i8> %sel, <128 x i8>* %a
   ret void
 }
 
-define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, <256 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT:    mov x29, sp
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    sub x9, sp, #496
-; VBITS_GE_2048-NEXT:    and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT:    ldr x8, [x2, #24]
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    ptrue p1.b
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #255]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #254]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #253]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #252]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #251]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #250]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #249]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #247]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #246]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #245]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #244]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #243]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #242]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #241]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #239]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #238]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #237]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #236]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #235]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #234]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #233]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #231]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #230]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #229]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #228]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #227]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #226]
-; VBITS_GE_2048-NEXT:    asr w11, w8, #31
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #225]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #223]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #222]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #221]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #220]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #219]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #218]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #217]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #215]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #214]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #213]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #212]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #211]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #210]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #209]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #207]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #206]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #205]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #204]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #203]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #202]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #201]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #199]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #198]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #197]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #196]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #195]
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #194]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #193]
-; VBITS_GE_2048-NEXT:    strb w8, [sp, #192]
-; VBITS_GE_2048-NEXT:    ldr x8, [x2, #16]
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #191]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #190]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #189]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #188]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #187]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #186]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #185]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #183]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #182]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #181]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #180]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #179]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #178]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #177]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #175]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #174]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #173]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #172]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #171]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #170]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #169]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #167]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #166]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #165]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #164]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #163]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #162]
-; VBITS_GE_2048-NEXT:    asr w11, w8, #31
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #161]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #159]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #158]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #157]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #156]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #155]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #154]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #153]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #151]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #150]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #149]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #148]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #147]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #146]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #145]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #143]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #142]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #141]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #140]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #139]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #138]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #137]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #136]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #135]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #134]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #133]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #132]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #131]
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #130]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #129]
-; VBITS_GE_2048-NEXT:    strb w8, [sp, #128]
-; VBITS_GE_2048-NEXT:    ldr x8, [x2, #8]
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #127]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #126]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #125]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #124]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #123]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #122]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #121]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #119]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #118]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #117]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #116]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #115]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #114]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #113]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #111]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #110]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #109]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #108]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #107]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #106]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #105]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #103]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #102]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #101]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #100]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #99]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #98]
-; VBITS_GE_2048-NEXT:    asr w11, w8, #31
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #97]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #95]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #94]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #93]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #92]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #91]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #90]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #89]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #87]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #86]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #85]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #84]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #83]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #82]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #81]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #79]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #78]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #77]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #76]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #75]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #74]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #73]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #71]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #70]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #69]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #68]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #67]
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #66]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #65]
-; VBITS_GE_2048-NEXT:    strb w8, [sp, #64]
-; VBITS_GE_2048-NEXT:    ldr x8, [x2]
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #63]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #62]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #61]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #60]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #59]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #58]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #57]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #55]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #54]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #53]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #52]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #51]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #50]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #49]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #47]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #46]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #45]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #44]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #43]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #42]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #41]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #39]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #38]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #37]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #36]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #35]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #34]
-; VBITS_GE_2048-NEXT:    asr w11, w8, #31
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #33]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #31]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #30]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #29]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #28]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #27]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #26]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #25]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #23]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #22]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #21]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #20]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #19]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #18]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #17]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #15]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #14]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #13]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #12]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #11]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #10]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #9]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #8]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #7]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #6]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #5]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #4]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #3]
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #2]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #1]
-; VBITS_GE_2048-NEXT:    strb w8, [sp]
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_2048-NEXT:    ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    and z0.b, z0.b, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.b, p1/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    sel z0.b, p1, z1.b, z2.b
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    mov sp, x29
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT:    .cfi_restore w30
-; VBITS_GE_2048-NEXT:    .cfi_restore w29
-; VBITS_GE_2048-NEXT:    ret
-  %mask = load <256 x i1>, <256 x i1>* %c
+define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
+  %mask = icmp eq <256 x i8> %op1, %op2
   %sel = select <256 x i1> %mask, <256 x i8> %op1, <256 x i8> %op2
   store <256 x i8> %sel, <256 x i8>* %a
   ret void
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #0 {
+define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v2.4h, v2.4h, #15
@@ -1164,7 +130,7 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
+define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
@@ -1176,633 +142,96 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #
   ret <8 x i16> %sel
 }
 
-define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, <16 x i1>* %c) #0 {
+define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    ldrh w8, [x2]
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    sbfx w9, w8, #15, #1
-; CHECK-NEXT:    sbfx w10, w8, #14, #1
-; CHECK-NEXT:    sbfx w11, w8, #13, #1
-; CHECK-NEXT:    strh w9, [sp, #30]
-; CHECK-NEXT:    sbfx w9, w8, #12, #1
-; CHECK-NEXT:    strh w10, [sp, #28]
-; CHECK-NEXT:    sbfx w10, w8, #11, #1
-; CHECK-NEXT:    strh w11, [sp, #26]
-; CHECK-NEXT:    sbfx w11, w8, #10, #1
-; CHECK-NEXT:    strh w9, [sp, #24]
-; CHECK-NEXT:    sbfx w9, w8, #9, #1
-; CHECK-NEXT:    strh w10, [sp, #22]
-; CHECK-NEXT:    sbfx w10, w8, #8, #1
-; CHECK-NEXT:    strh w11, [sp, #20]
-; CHECK-NEXT:    sbfx w11, w8, #7, #1
-; CHECK-NEXT:    strh w9, [sp, #18]
-; CHECK-NEXT:    sbfx w9, w8, #6, #1
-; CHECK-NEXT:    strh w10, [sp, #16]
-; CHECK-NEXT:    sbfx w10, w8, #5, #1
-; CHECK-NEXT:    strh w11, [sp, #14]
-; CHECK-NEXT:    sbfx w11, w8, #4, #1
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    sbfx w9, w8, #3, #1
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    sbfx w10, w8, #2, #1
-; CHECK-NEXT:    strh w11, [sp, #8]
-; CHECK-NEXT:    sbfx w11, w8, #1, #1
-; CHECK-NEXT:    sbfx w8, w8, #0, #1
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    strh w11, [sp, #2]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
-; CHECK-NEXT:    and z0.h, z0.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    ret
-  %mask = load <16 x i1>, <16 x i1>* %c
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
+  %mask = icmp eq <16 x i16> %op1, %op2
   %sel = select <16 x i1> %mask, <16 x i16> %op1, <16 x i16> %op2
   store <16 x i16> %sel, <16 x i16>* %a
   ret void
 }
 
-define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i1>* %c) #0 {
+define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z2.h
+; VBITS_GE_256-NEXT:    sel z1.h, p2, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: select_v32i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT:    mov x29, sp
-; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    sub x9, sp, #112
-; VBITS_GE_512-NEXT:    and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT:    ldr w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    asr w9, w8, #31
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #30, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #29, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #62]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #28, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #60]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #27, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #58]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #26, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #25, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #54]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #24, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #52]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #23, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #50]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #22, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #21, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #46]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #20, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #44]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #19, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #42]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #18, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #17, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #38]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #16, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #36]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #15, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #34]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #14, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #13, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #30]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #12, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #28]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #11, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #26]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #10, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #9, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #22]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #8, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #20]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #7, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #18]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #6, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #5, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #14]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #4, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #12]
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #3, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #10]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #2, #1
-; VBITS_GE_512-NEXT:    strh w9, [sp, #8]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #6]
-; VBITS_GE_512-NEXT:    strh w11, [sp, #4]
-; VBITS_GE_512-NEXT:    strh w9, [sp, #2]
-; VBITS_GE_512-NEXT:    strh w8, [sp]
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    and z0.h, z0.h, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z0.h, #0
-; VBITS_GE_512-NEXT:    sel z0.h, p1, z1.h, z2.h
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    mov sp, x29
-; VBITS_GE_512-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT:    .cfi_restore w30
-; VBITS_GE_512-NEXT:    .cfi_restore w29
 ; VBITS_GE_512-NEXT:    ret
-  %mask = load <32 x i1>, <32 x i1>* %c
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
+  %mask = icmp eq <32 x i16> %op1, %op2
   %sel = select <32 x i1> %mask, <32 x i16> %op1, <32 x i16> %op2
   store <32 x i16> %sel, <32 x i16>* %a
   ret void
 }
 
-define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, <64 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT:    mov x29, sp
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    sub x9, sp, #240
-; VBITS_GE_1024-NEXT:    and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT:    ldr x8, [x2]
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ptrue p1.h
-; VBITS_GE_1024-NEXT:    asr x9, x8, #63
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #126]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #124]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #122]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #118]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #116]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #114]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #110]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #108]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #106]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #102]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #100]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #98]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #94]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #92]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #90]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #86]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #84]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #82]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #78]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #76]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #74]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #70]
-; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #68]
-; VBITS_GE_1024-NEXT:    asr w11, w8, #31
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #66]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #64]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #62]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #60]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #58]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #54]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #52]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #50]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #46]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #44]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #42]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #38]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #36]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #34]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #30]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #28]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #26]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #22]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #20]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #18]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #14]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #12]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #10]
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #8]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT:    strh w9, [sp, #6]
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #4]
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #2]
-; VBITS_GE_1024-NEXT:    strh w8, [sp]
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    and z0.h, z0.h, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.h, p1/z, z0.h, #0
-; VBITS_GE_1024-NEXT:    sel z0.h, p1, z1.h, z2.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    mov sp, x29
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT:    .cfi_restore w30
-; VBITS_GE_1024-NEXT:    .cfi_restore w29
-; VBITS_GE_1024-NEXT:    ret
-  %mask = load <64 x i1>, <64 x i1>* %c
+define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
+  %mask = icmp eq <64 x i16> %op1, %op2
   %sel = select <64 x i1> %mask, <64 x i16> %op1, <64 x i16> %op2
   store <64 x i16> %sel, <64 x i16>* %a
   ret void
 }
 
-define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, <128 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT:    mov x29, sp
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    sub x9, sp, #496
-; VBITS_GE_2048-NEXT:    and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT:    ldr x8, [x2, #8]
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ptrue p1.h
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #254]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #252]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #250]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #246]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #244]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #242]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #238]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #236]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #234]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #230]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #228]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #226]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #222]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #220]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #218]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #214]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #212]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #210]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #206]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #204]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #202]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #198]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #196]
-; VBITS_GE_2048-NEXT:    asr w11, w8, #31
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #194]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #190]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #188]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #186]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #182]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #180]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #178]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #174]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #172]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #170]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #166]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #164]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #162]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #158]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #156]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #154]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #150]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #148]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #146]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #142]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #140]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #138]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #136]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #134]
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #132]
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #130]
-; VBITS_GE_2048-NEXT:    strh w8, [sp, #128]
-; VBITS_GE_2048-NEXT:    ldr x8, [x2]
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #126]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #124]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #122]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #118]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #116]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #114]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #110]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #108]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #106]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #102]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #100]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #98]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #94]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #92]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #90]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #86]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #84]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #82]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #78]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #76]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #74]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #70]
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #68]
-; VBITS_GE_2048-NEXT:    asr w11, w8, #31
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #66]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #62]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #60]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #58]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #54]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #52]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #50]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #46]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #44]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #42]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #38]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #36]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #34]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #30]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #28]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #26]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #22]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #20]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #18]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #14]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #12]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #10]
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #8]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #6]
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #4]
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #2]
-; VBITS_GE_2048-NEXT:    strh w8, [sp]
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    and z0.h, z0.h, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.h, p1/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    sel z0.h, p1, z1.h, z2.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    mov sp, x29
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT:    .cfi_restore w30
-; VBITS_GE_2048-NEXT:    .cfi_restore w29
-; VBITS_GE_2048-NEXT:    ret
-  %mask = load <128 x i1>, <128 x i1>* %c
+define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
+  %mask = icmp eq <128 x i16> %op1, %op2
   %sel = select <128 x i1> %mask, <128 x i16> %op1, <128 x i16> %op2
   store <128 x i16> %sel, <128 x i16>* %a
   ret void
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #0 {
+define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #31
@@ -1814,7 +243,7 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #0 {
+define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
@@ -1826,332 +255,96 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #
   ret <4 x i32> %sel
 }
 
-define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, <8 x i1>* %c) #0 {
+define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    ldrb w8, [x2]
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    sbfx w9, w8, #7, #1
-; CHECK-NEXT:    sbfx w10, w8, #6, #1
-; CHECK-NEXT:    sbfx w11, w8, #5, #1
-; CHECK-NEXT:    sbfx w12, w8, #4, #1
-; CHECK-NEXT:    stp w10, w9, [sp, #24]
-; CHECK-NEXT:    sbfx w9, w8, #3, #1
-; CHECK-NEXT:    sbfx w10, w8, #2, #1
-; CHECK-NEXT:    stp w12, w11, [sp, #16]
-; CHECK-NEXT:    sbfx w11, w8, #1, #1
-; CHECK-NEXT:    sbfx w8, w8, #0, #1
-; CHECK-NEXT:    stp w10, w9, [sp, #8]
-; CHECK-NEXT:    stp w8, w11, [sp]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; CHECK-NEXT:    and z0.s, z0.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    ret
-  %mask = load <8 x i1>, <8 x i1>* %c
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
+  %mask = icmp eq <8 x i32> %op1, %op2
   %sel = select <8 x i1> %mask, <8 x i32> %op1, <8 x i32> %op2
   store <8 x i32> %sel, <8 x i32>* %a
   ret void
 }
 
-define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i1>* %c) #0 {
+define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z2.s
+; VBITS_GE_256-NEXT:    sel z1.s, p2, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: select_v16i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT:    mov x29, sp
-; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    sub x9, sp, #112
-; VBITS_GE_512-NEXT:    and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT:    ldrh w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w8, #12, #1
-; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #11, #1
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #10, #1
-; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #9, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w8, #8, #1
-; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #7, #1
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #6, #1
-; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #5, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w8, #4, #1
-; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_512-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #8]
-; VBITS_GE_512-NEXT:    stp w8, w11, [sp]
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    and z0.s, z0.s, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z0.s, #0
-; VBITS_GE_512-NEXT:    sel z0.s, p1, z1.s, z2.s
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    mov sp, x29
-; VBITS_GE_512-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT:    .cfi_restore w30
-; VBITS_GE_512-NEXT:    .cfi_restore w29
 ; VBITS_GE_512-NEXT:    ret
-  %mask = load <16 x i1>, <16 x i1>* %c
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
+  %mask = icmp eq <16 x i32> %op1, %op2
   %sel = select <16 x i1> %mask, <16 x i32> %op1, <16 x i32> %op2
   store <16 x i32> %sel, <16 x i32>* %a
   ret void
 }
 
-define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT:    mov x29, sp
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    sub x9, sp, #240
-; VBITS_GE_1024-NEXT:    and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT:    ldr w8, [x2]
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ptrue p1.s
-; VBITS_GE_1024-NEXT:    asr w9, w8, #31
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #30, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #29, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #28, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #24, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #23, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #22, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #21, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #20, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #19, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #18, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #17, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #16, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #64]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #12, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #11, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #10, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #9, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #8, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #7, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #6, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #5, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w8, #4, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #8]
-; VBITS_GE_1024-NEXT:    stp w8, w11, [sp]
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    and z0.s, z0.s, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.s, p1/z, z0.s, #0
-; VBITS_GE_1024-NEXT:    sel z0.s, p1, z1.s, z2.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    mov sp, x29
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT:    .cfi_restore w30
-; VBITS_GE_1024-NEXT:    .cfi_restore w29
-; VBITS_GE_1024-NEXT:    ret
-  %mask = load <32 x i1>, <32 x i1>* %c
+define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
+  %mask = icmp eq <32 x i32> %op1, %op2
   %sel = select <32 x i1> %mask, <32 x i32> %op1, <32 x i32> %op2
   store <32 x i32> %sel, <32 x i32>* %a
   ret void
 }
 
-define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT:    mov x29, sp
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    sub x9, sp, #496
-; VBITS_GE_2048-NEXT:    and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT:    ldr x8, [x2]
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ptrue p1.s
-; VBITS_GE_2048-NEXT:    asr x9, x8, #63
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #60, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #59, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #58, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #57, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #56, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #55, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #54, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #53, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #52, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #48, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #47, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #46, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #45, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #44, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #43, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #42, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #41, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #40, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #36, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx x9, x8, #35, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x8, #34, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx x11, x8, #33, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x8, #32, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #136]
-; VBITS_GE_2048-NEXT:    asr w9, w8, #31
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #30, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #128]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #29, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #28, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #24, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #23, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #22, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #21, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #20, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #19, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #18, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #17, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #16, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #12, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #11, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #10, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #9, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #8, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #7, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #6, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #5, #1
-; VBITS_GE_2048-NEXT:    sbfx w12, w8, #4, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #8]
-; VBITS_GE_2048-NEXT:    stp w8, w11, [sp]
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    and z0.s, z0.s, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.s, p1/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    sel z0.s, p1, z1.s, z2.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    mov sp, x29
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT:    .cfi_restore w30
-; VBITS_GE_2048-NEXT:    .cfi_restore w29
-; VBITS_GE_2048-NEXT:    ret
-  %mask = load <64 x i1>, <64 x i1>* %c
+define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
+  %mask = icmp eq <64 x i32> %op1, %op2
   %sel = select <64 x i1> %mask, <64 x i32> %op1, <64 x i32> %op2
   store <64 x i32> %sel, <64 x i32>* %a
   ret void
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 {
+define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst w0, #0x1
@@ -2164,7 +357,7 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #0 {
+define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
@@ -2176,322 +369,89 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #
   ret <2 x i64> %sel
 }
 
-define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, <4 x i1>* %c) #0 {
+define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    ldrb w8, [x2]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    lsr w9, w8, #3
-; CHECK-NEXT:    lsr w10, w8, #2
-; CHECK-NEXT:    sbfx x11, x8, #0, #1
-; CHECK-NEXT:    lsr w8, w8, #1
-; CHECK-NEXT:    sbfx x9, x9, #0, #1
-; CHECK-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    stp x10, x9, [sp, #16]
-; CHECK-NEXT:    stp x11, x8, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; CHECK-NEXT:    and z0.d, z0.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    ret
-  %mask = load <4 x i1>, <4 x i1>* %c
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
+  %mask = icmp eq <4 x i64> %op1, %op2
   %sel = select <4 x i1> %mask, <4 x i64> %op1, <4 x i64> %op2
   store <4 x i64> %sel, <4 x i64>* %a
   ret void
 }
 
-define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, <8 x i1>* %c) #0 {
+define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_256-LABEL: select_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT:    sel z1.d, p2, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: select_v8i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_512-NEXT:    mov x29, sp
-; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    sub x9, sp, #112
-; VBITS_GE_512-NEXT:    and sp, x9, #0xffffffffffffffc0
-; VBITS_GE_512-NEXT:    ldrb w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    lsr w9, w8, #7
-; VBITS_GE_512-NEXT:    lsr w10, w8, #6
-; VBITS_GE_512-NEXT:    lsr w11, w8, #5
-; VBITS_GE_512-NEXT:    lsr w12, w8, #4
-; VBITS_GE_512-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_512-NEXT:    lsr w13, w8, #3
-; VBITS_GE_512-NEXT:    stp x10, x9, [sp, #48]
-; VBITS_GE_512-NEXT:    lsr w9, w8, #2
-; VBITS_GE_512-NEXT:    stp x12, x11, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx x11, x8, #0, #1
-; VBITS_GE_512-NEXT:    lsr w8, w8, #1
-; VBITS_GE_512-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x8, x8, #0, #1
-; VBITS_GE_512-NEXT:    stp x9, x10, [sp, #16]
-; VBITS_GE_512-NEXT:    stp x11, x8, [sp]
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    and z0.d, z0.d, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_512-NEXT:    sel z0.d, p1, z1.d, z2.d
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    mov sp, x29
-; VBITS_GE_512-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_512-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_512-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_512-NEXT:    .cfi_restore w30
-; VBITS_GE_512-NEXT:    .cfi_restore w29
 ; VBITS_GE_512-NEXT:    ret
-  %mask = load <8 x i1>, <8 x i1>* %c
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
+  %mask = icmp eq <8 x i64> %op1, %op2
   %sel = select <8 x i1> %mask, <8 x i64> %op1, <8 x i64> %op2
   store <8 x i64> %sel, <8 x i64>* %a
   ret void
 }
 
-define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, <16 x i1>* %c) #0 {
-; VBITS_GE_1024-LABEL: select_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_1024-NEXT:    mov x29, sp
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    sub x9, sp, #240
-; VBITS_GE_1024-NEXT:    and sp, x9, #0xffffffffffffff80
-; VBITS_GE_1024-NEXT:    ldrh w8, [x2]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d
-; VBITS_GE_1024-NEXT:    lsr w9, w8, #15
-; VBITS_GE_1024-NEXT:    lsr w10, w8, #14
-; VBITS_GE_1024-NEXT:    lsr w11, w8, #13
-; VBITS_GE_1024-NEXT:    lsr w12, w8, #12
-; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w13, w8, #11
-; VBITS_GE_1024-NEXT:    lsr w14, w8, #10
-; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #112]
-; VBITS_GE_1024-NEXT:    lsr w9, w8, #9
-; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #96]
-; VBITS_GE_1024-NEXT:    lsr w12, w8, #8
-; VBITS_GE_1024-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w13, w8, #3
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #80]
-; VBITS_GE_1024-NEXT:    lsr w10, w8, #6
-; VBITS_GE_1024-NEXT:    stp x12, x9, [sp, #64]
-; VBITS_GE_1024-NEXT:    lsr w9, w8, #7
-; VBITS_GE_1024-NEXT:    lsr w11, w8, #5
-; VBITS_GE_1024-NEXT:    lsr w12, w8, #4
-; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #48]
-; VBITS_GE_1024-NEXT:    lsr w10, w8, #2
-; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx x11, x8, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w8, w8, #1
-; VBITS_GE_1024-NEXT:    sbfx x9, x13, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x8, x8, #0, #1
-; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #16]
-; VBITS_GE_1024-NEXT:    stp x11, x8, [sp]
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    and z0.d, z0.d, #0x1
-; VBITS_GE_1024-NEXT:    cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_1024-NEXT:    sel z0.d, p1, z1.d, z2.d
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    mov sp, x29
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_1024-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_1024-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_1024-NEXT:    .cfi_restore w30
-; VBITS_GE_1024-NEXT:    .cfi_restore w29
-; VBITS_GE_1024-NEXT:    ret
-  %mask = load <16 x i1>, <16 x i1>* %c
+define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: select_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
+  %mask = icmp eq <16 x i64> %op1, %op2
   %sel = select <16 x i1> %mask, <16 x i64> %op1, <16 x i64> %op2
   store <16 x i64> %sel, <16 x i64>* %a
   ret void
 }
 
-define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 {
-; VBITS_GE_2048-LABEL: select_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_2048-NEXT:    mov x29, sp
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    sub x9, sp, #496
-; VBITS_GE_2048-NEXT:    and sp, x9, #0xffffffffffffff00
-; VBITS_GE_2048-NEXT:    ldr w8, [x2]
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d
-; VBITS_GE_2048-NEXT:    ubfx x9, x8, #31, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #30, #2
-; VBITS_GE_2048-NEXT:    // kill: def $w9 killed $w9 killed $x9 def $x9
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #29, #3
-; VBITS_GE_2048-NEXT:    ubfx x12, x8, #28, #4
-; VBITS_GE_2048-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #27, #5
-; VBITS_GE_2048-NEXT:    ubfx x14, x8, #26, #6
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x12, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x13, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #25, #7
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #23, #9
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x11, x9, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x9, x14, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #24, #8
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x9, x10, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #22, #10
-; VBITS_GE_2048-NEXT:    sbfx x12, x13, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #21, #11
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x11, x9, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx x9, x10, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #20, #12
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #19, #13
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x8, #18, #14
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #17, #15
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #16, #16
-; VBITS_GE_2048-NEXT:    ubfx x12, x8, #15, #17
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #14, #18
-; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #13, #19
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #128]
-; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #12, #20
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #11, #21
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x8, #10, #22
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #9, #23
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #8, #24
-; VBITS_GE_2048-NEXT:    ubfx x12, x8, #7, #25
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #6, #26
-; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #5, #27
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x10, x8, #4, #28
-; VBITS_GE_2048-NEXT:    ubfx x11, x8, #3, #29
-; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x8, #2, #30
-; VBITS_GE_2048-NEXT:    ubfx x13, x8, #1, #31
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    sbfx x8, x8, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #16]
-; VBITS_GE_2048-NEXT:    stp x8, x10, [sp]
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    and z0.d, z0.d, #0x1
-; VBITS_GE_2048-NEXT:    cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    sel z0.d, p1, z1.d, z2.d
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    mov sp, x29
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa wsp, 16
-; VBITS_GE_2048-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_2048-NEXT:    .cfi_def_cfa_offset 0
-; VBITS_GE_2048-NEXT:    .cfi_restore w30
-; VBITS_GE_2048-NEXT:    .cfi_restore w29
-; VBITS_GE_2048-NEXT:    ret
-  %mask = load <32 x i1>, <32 x i1>* %c
+define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: select_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
+  %mask = icmp eq <32 x i64> %op1, %op2
   %sel = select <32 x i1> %mask, <32 x i64> %op1, <32 x i64> %op2
   store <32 x i64> %sel, <32 x i64>* %a
   ret void

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
index a4aba59034e7..485df06b8964 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
@@ -1,35 +1,29 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ; Don't use SVE for 64-bit vectors.
 define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
 ; CHECK-LABEL: load_v2f32:
-; CHECK: ldr d0, [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
   %load = load <2 x float>, <2 x float>* %a
   ret <2 x float> %load
 }
@@ -37,66 +31,164 @@ define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
 ; Don't use SVE for 128-bit vectors.
 define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
 ; CHECK-LABEL: load_v4f32:
-; CHECK: ldr q0, [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
   %load = load <4 x float>, <4 x float>* %a
   ret <4 x float> %load
 }
 
 define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
 ; CHECK-LABEL: load_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %load = load <8 x float>, <8 x float>* %a
   ret <8 x float> %load
 }
 
 define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: load_v16f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: load_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: load_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT:    ret
+;
+; VBITS_GE_1024-LABEL: load_v16f32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
+;
+; VBITS_GE_2048-LABEL: load_v16f32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT:    ret
   %load = load <16 x float>, <16 x float>* %a
   ret <16 x float> %load
 }
 
 define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
-; CHECK-LABEL: load_v32f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: load_v32f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    mov x10, #24
+; VBITS_GE_256-NEXT:    mov x11, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: load_v32f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    mov x9, #16
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_512-NEXT:    ret
+;
+; VBITS_GE_1024-LABEL: load_v32f32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
+;
+; VBITS_GE_2048-LABEL: load_v32f32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT:    ret
   %load = load <32 x float>, <32 x float>* %a
   ret <32 x float> %load
 }
 
 define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
-; CHECK-LABEL: load_v64f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
-; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
-; VBITS_LE_512-DAG:  mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_512-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2]
-; VBITS_LE_512-DAG:  mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_512-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
-; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A4]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
-; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A5]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
-; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A6]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
-; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A7]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: load_v64f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    mov x10, #48
+; VBITS_GE_256-NEXT:    mov x11, #56
+; VBITS_GE_256-NEXT:    mov x12, #32
+; VBITS_GE_256-NEXT:    mov x13, #40
+; VBITS_GE_256-NEXT:    mov x14, #16
+; VBITS_GE_256-NEXT:    mov x15, #24
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x15, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8, x13, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x12, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x8, x15, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x8, x14, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: load_v64f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    mov x9, #32
+; VBITS_GE_512-NEXT:    mov x10, #48
+; VBITS_GE_512-NEXT:    mov x11, #16
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_512-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1w { z1.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z2.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z3.s }, p0, [x8]
+; VBITS_GE_512-NEXT:    ret
+;
+; VBITS_GE_1024-LABEL: load_v64f32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    mov x9, #32
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT:    ret
+;
+; VBITS_GE_2048-LABEL: load_v64f32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_2048-NEXT:    ret
   %load = load <64 x float>, <64 x float>* %a
   ret <64 x float> %load
 }

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
index 28f354f47b19..743aa295d75e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
@@ -1,343 +1,363 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; ANDV
 ;
 
 ; No single instruction NEON ANDV support. Use SVE.
-define i8 @andv_v8i8(<8 x i8> %a) #0 {
+define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; No single instruction NEON ANDV support. Use SVE.
-define i8 @andv_v16i8(<16 x i8> %a) #0 {
+define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @andv_v32i8(<32 x i8>* %a) #0 {
+define i8 @andv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @andv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: andv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv b[[REDUCE:[0-9]+]], [[PG]], [[AND]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
-
+; VBITS_GE_256-LABEL: andv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    andv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: andv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    andv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @andv_v128i8(<128 x i8>* %a) #0 {
+define i8 @andv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: andv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @andv_v256i8(<256 x i8>* %a) #0 {
+define i8 @andv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: andv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; No single instruction NEON ANDV support. Use SVE.
-define i16 @andv_v4i16(<4 x i16> %a) #0 {
+define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; No single instruction NEON ANDV support. Use SVE.
-define i16 @andv_v8i16(<8 x i16> %a) #0 {
+define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @andv_v16i16(<16 x i16>* %a) #0 {
+define i16 @andv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @andv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: andv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv h[[REDUCE:[0-9]+]], [[PG]], [[AND]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: andv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    andv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: andv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    andv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @andv_v64i16(<64 x i16>* %a) #0 {
+define i16 @andv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: andv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @andv_v128i16(<128 x i16>* %a) #0 {
+define i16 @andv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: andv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; No single instruction NEON ANDV support. Use SVE.
-define i32 @andv_v2i32(<2 x i32> %a) #0 {
+define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; No single instruction NEON ANDV support. Use SVE.
-define i32 @andv_v4i32(<4 x i32> %a) #0 {
+define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @andv_v8i32(<8 x i32>* %a) #0 {
+define i32 @andv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @andv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: andv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv [[REDUCE:s[0-9]+]], [[PG]], [[AND]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: andv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    andv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: andv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    andv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @andv_v32i32(<32 x i32>* %a) #0 {
+define i32 @andv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: andv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @andv_v64i32(<64 x i32>* %a) #0 {
+define i32 @andv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: andv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @andv_v1i64(<1 x i64> %a) #0 {
+define i64 @andv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; Use SVE for 128-bit vectors
-define i64 @andv_v2i64(<2 x i64> %a) #0 {
+define i64 @andv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: andv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK: fmov x0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    andv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @andv_v4i64(<4 x i64>* %a) #0 {
+define i64 @andv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    andv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @andv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: andv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: andv [[REDUCE:d[0-9]+]], [[PG]], [[AND]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: andv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    andv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: andv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    andv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @andv_v16i64(<16 x i64>* %a) #0 {
+define i64 @andv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: andv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    andv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @andv_v32i64(<32 x i64>* %a) #0 {
+define i64 @andv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: andv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    andv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
   ret i64 %res
@@ -348,319 +368,354 @@ define i64 @andv_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; No single instruction NEON EORV support. Use SVE.
-define i8 @eorv_v8i8(<8 x i8> %a) #0 {
+define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; No single instruction NEON EORV support. Use SVE.
-define i8 @eorv_v16i8(<16 x i8> %a) #0 {
+define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @eorv_v32i8(<32 x i8>* %a) #0 {
+define i8 @eorv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @eorv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: eorv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv b[[REDUCE:[0-9]+]], [[PG]], [[EOR]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
-
+; VBITS_GE_256-LABEL: eorv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    eorv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: eorv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    eorv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @eorv_v128i8(<128 x i8>* %a) #0 {
+define i8 @eorv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: eorv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @eorv_v256i8(<256 x i8>* %a) #0 {
+define i8 @eorv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: eorv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; No single instruction NEON EORV support. Use SVE.
-define i16 @eorv_v4i16(<4 x i16> %a) #0 {
+define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; No single instruction NEON EORV support. Use SVE.
-define i16 @eorv_v8i16(<8 x i16> %a) #0 {
+define i16 @eorv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @eorv_v16i16(<16 x i16>* %a) #0 {
+define i16 @eorv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @eorv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: eorv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv h[[REDUCE:[0-9]+]], [[PG]], [[EOR]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: eorv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    eorv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: eorv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    eorv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @eorv_v64i16(<64 x i16>* %a) #0 {
+define i16 @eorv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: eorv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @eorv_v128i16(<128 x i16>* %a) #0 {
+define i16 @eorv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: eorv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; No single instruction NEON EORV support. Use SVE.
-define i32 @eorv_v2i32(<2 x i32> %a) #0 {
+define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; No single instruction NEON EORV support. Use SVE.
-define i32 @eorv_v4i32(<4 x i32> %a) #0 {
+define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @eorv_v8i32(<8 x i32>* %a) #0 {
+define i32 @eorv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @eorv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: eorv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv [[REDUCE:s[0-9]+]], [[PG]], [[EOR]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: eorv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    eorv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: eorv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    eorv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @eorv_v32i32(<32 x i32>* %a) #0 {
+define i32 @eorv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: eorv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @eorv_v64i32(<64 x i32>* %a) #0 {
+define i32 @eorv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: eorv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @eorv_v1i64(<1 x i64> %a) #0 {
+define i64 @eorv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; Use SVE for 128-bit vectors
-define i64 @eorv_v2i64(<2 x i64> %a) #0 {
+define i64 @eorv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: eorv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK: fmov x0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @eorv_v4i64(<4 x i64>* %a) #0 {
+define i64 @eorv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @eorv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: eorv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: eorv [[REDUCE:d[0-9]+]], [[PG]], [[EOR]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: eorv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    eorv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: eorv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    eorv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @eorv_v16i64(<16 x i64>* %a) #0 {
+define i64 @eorv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: eorv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
+define i64 @eorv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: eorv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
   ret i64 %res
@@ -671,319 +726,354 @@ define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; No single instruction NEON ORV support. Use SVE.
-define i8 @orv_v8i8(<8 x i8> %a) #0 {
+define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
 ; No single instruction NEON ORV support. Use SVE.
-define i8 @orv_v16i8(<16 x i8> %a) #0 {
+define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
-define i8 @orv_v32i8(<32 x i8>* %a) #0 {
+define i8 @orv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
 define i8 @orv_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: orv_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv b[[REDUCE:[0-9]+]], [[PG]], [[OR]].b
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
-
+; VBITS_GE_256-LABEL: orv_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    orv b0, p0, z0.b
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: orv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    orv b0, p0, z0.b
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
-define i8 @orv_v128i8(<128 x i8>* %a) #0 {
+define i8 @orv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: orv_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
-define i8 @orv_v256i8(<256 x i8>* %a) #0 {
+define i8 @orv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: orv_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
 ; No single instruction NEON ORV support. Use SVE.
-define i16 @orv_v4i16(<4 x i16> %a) #0 {
+define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
 ; No single instruction NEON ORV support. Use SVE.
-define i16 @orv_v8i16(<8 x i16> %a) #0 {
+define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
-; CHECK: fmov w0, s[[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
-define i16 @orv_v16i16(<16 x i16>* %a) #0 {
+define i16 @orv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; CHECK-NEXT: fmov w0, s[[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
 define i16 @orv_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: orv_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv h[[REDUCE:[0-9]+]], [[PG]], [[OR]].h
-; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: orv_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    orv h0, p0, z0.h
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: orv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    orv h0, p0, z0.h
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
-define i16 @orv_v64i16(<64 x i16>* %a) #0 {
+define i16 @orv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: orv_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
-define i16 @orv_v128i16(<128 x i16>* %a) #0 {
+define i16 @orv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: orv_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
-; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
 ; No single instruction NEON ORV support. Use SVE.
-define i32 @orv_v2i32(<2 x i32> %a) #0 {
+define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
 ; No single instruction NEON ORV support. Use SVE.
-define i32 @orv_v4i32(<4 x i32> %a) #0 {
+define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
-; CHECK: fmov w0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
-define i32 @orv_v8i32(<8 x i32>* %a) #0 {
+define i32 @orv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; CHECK-NEXT: fmov w0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
 define i32 @orv_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: orv_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv [[REDUCE:s[0-9]+]], [[PG]], [[OR]].s
-; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: orv_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    orv s0, p0, z0.s
+; VBITS_GE_256-NEXT:    fmov w0, s0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: orv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    orv s0, p0, z0.s
+; VBITS_GE_512-NEXT:    fmov w0, s0
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
-define i32 @orv_v32i32(<32 x i32>* %a) #0 {
+define i32 @orv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: orv_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
-define i32 @orv_v64i32(<64 x i32>* %a) #0 {
+define i32 @orv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: orv_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
-; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
 ; Nothing to do for single element vectors.
-define i64 @orv_v1i64(<1 x i64> %a) #0 {
+define i64 @orv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v1i64:
-; CHECK: fmov x0, d0
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
 ; Use SVE for 128-bit vectors
-define i64 @orv_v2i64(<2 x i64> %a) #0 {
+define i64 @orv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK: orv [[REDUCE:d[0-9]+]], [[PG]], z0.d
-; CHECK: fmov x0, [[REDUCE]]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
-define i64 @orv_v4i64(<4 x i64>* %a) #0 {
+define i64 @orv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; CHECK-NEXT: fmov x0, [[REDUCE]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
 define i64 @orv_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: orv_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
-; VBITS_EQ_256-DAG: orv [[REDUCE:d[0-9]+]], [[PG]], [[OR]].d
-; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: orv_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
+; VBITS_GE_256-NEXT:    orv d0, p0, z0.d
+; VBITS_GE_256-NEXT:    fmov x0, d0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: orv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    orv d0, p0, z0.d
+; VBITS_GE_512-NEXT:    fmov x0, d0
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
-define i64 @orv_v16i64(<16 x i64>* %a) #0 {
+define i64 @orv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: orv_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
-define i64 @orv_v32i64(<32 x i64>* %a) #0 {
+define i64 @orv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: orv_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
-; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
   ret i64 %res

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
index 8e8500348be0..dba92869aed1 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
@@ -1,18 +1,7 @@
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; LD1B
 ;
 
-define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
+define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
@@ -36,7 +25,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
+define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -54,21 +43,21 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
 }
 
 define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i8:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT:    ld1b { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
-; VBITS_EQ_256-NEXT:    str d0, [x0]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_gather_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT:    ld1b { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; VBITS_GE_256-NEXT:    str d0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i8:
 ; VBITS_GE_512:       // %bb.0:
@@ -86,17 +75,17 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_1024-NEXT:    str q0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <16 x i8*>, <16 x i8*>* %b
   %vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                        i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
@@ -104,18 +93,18 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <32 x i8*>, <32 x i8*>* %b
   %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                        i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -129,7 +118,7 @@ define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
 ; LD1H
 ;
 
-define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
+define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
@@ -145,7 +134,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
+define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -162,21 +151,21 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
 }
 
 define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT:    ld1h { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_EQ_256-NEXT:    str q1, [x0]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_gather_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    str q1, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i16:
 ; VBITS_GE_512:       // %bb.0:
@@ -193,17 +182,17 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <16 x i16*>, <16 x i16*>* %b
   %vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                           i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> undef)
@@ -211,17 +200,17 @@ define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <32 x i16*>, <32 x i16*>* %b
   %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                           i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -235,7 +224,7 @@ define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
 ; LD1W
 ;
 
-define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
+define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
@@ -250,7 +239,7 @@ define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
+define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -266,21 +255,21 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
 }
 
 define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i32:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT:    ld1w { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_gather_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT:    ld1w { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i32:
 ; VBITS_GE_512:       // %bb.0:
@@ -297,16 +286,16 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <16 x i32*>, <16 x i32*>* %b
   %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                           i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
@@ -314,16 +303,16 @@ define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <32 x i32*>, <32 x i32*>* %b
   %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                           i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -337,7 +326,7 @@ define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
 ; LD1D
 ;
 
-define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
+define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
@@ -351,7 +340,7 @@ define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
+define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -366,17 +355,17 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
 }
 
 define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i64:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_gather_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i64:
 ; VBITS_GE_512:       // %bb.0:
@@ -391,14 +380,14 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <16 x i64*>, <16 x i64*>* %b
   %vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                           i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i64> undef)
@@ -406,14 +395,14 @@ define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <32 x i64*>, <32 x i64*>* %b
   %vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x i64*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                                                           i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 0fe797e547b7..af9bf560afca 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -23,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; LD1B
 ;
 
-define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
+define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
@@ -48,7 +34,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
+define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
@@ -74,7 +60,6 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
 }
 
 define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: masked_gather_v8i8:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr d0, [x0]
@@ -129,78 +114,23 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    zip2 v1.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT:    zip1 v2.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT:    shl v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT:    zip2 v3.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT:    zip1 v0.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT:    sshr v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    shl v3.4h, v3.4h, #8
-; VBITS_GE_256-NEXT:    shl v0.4h, v0.4h, #8
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    sshr v1.4h, v3.4h, #8
-; VBITS_GE_256-NEXT:    sshr v0.4h, v0.4h, #8
-; VBITS_GE_256-NEXT:    cmpne p2.d, p0/z, z2.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    ld1b { z2.d }, p1/z, [z6.d]
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT:    ld1b { z3.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT:    ld1b { z0.d }, p1/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1b { z1.d }, p0/z, [z4.d]
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    uzp1 v0.16b, v3.16b, v1.16b
-; VBITS_GE_256-NEXT:    str q0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ldr q0, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    cmeq v0.16b, v0.16b, #0
-; VBITS_GE_1024-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; VBITS_GE_1024-NEXT:    ld1b { z0.d }, p0/z, [z1.d]
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_1024-NEXT:    str q0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x i8>, <16 x i8>* %a
   %ptrs = load <16 x i8*>, <16 x i8*>* %b
   %mask = icmp eq <16 x i8> %cval, zeroinitializer
@@ -209,199 +139,23 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_GE_256-NEXT:    .cfi_def_cfa_offset 16
-; VBITS_GE_256-NEXT:    mov x29, sp
-; VBITS_GE_256-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_GE_256-NEXT:    .cfi_offset w30, -8
-; VBITS_GE_256-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_256-NEXT:    sub x9, sp, #48
-; VBITS_GE_256-NEXT:    and sp, x9, #0xffffffffffffffe0
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
-; VBITS_GE_256-NEXT:    mov z4.b, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    zip2 v2.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    shl v3.4h, v2.4h, #8
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    sshr v5.4h, v3.4h, #8
-; VBITS_GE_256-NEXT:    mov x8, #20
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1b { z5.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT:    zip1 v7.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    shl v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z5.h, z5.h, z5.h
-; VBITS_GE_256-NEXT:    umov w8, v5.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v5.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v5.h[1]
-; VBITS_GE_256-NEXT:    sshr v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT:    umov w11, v5.h[0]
-; VBITS_GE_256-NEXT:    mov z5.d, z4.d
-; VBITS_GE_256-NEXT:    sunpklo z7.s, z7.h
-; VBITS_GE_256-NEXT:    ext z5.b, z5.b, z4.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z7.d, z7.s
-; VBITS_GE_256-NEXT:    strb w8, [sp, #7]
-; VBITS_GE_256-NEXT:    strb w9, [sp, #6]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z7.d, #0
-; VBITS_GE_256-NEXT:    strb w10, [sp, #5]
-; VBITS_GE_256-NEXT:    strb w11, [sp, #4]
-; VBITS_GE_256-NEXT:    ld1b { z7.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT:    zip2 v17.8b, v5.8b, v0.8b
-; VBITS_GE_256-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT:    uzp1 z7.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    shl v17.4h, v17.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z7.h, z7.h, z7.h
-; VBITS_GE_256-NEXT:    umov w8, v7.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v7.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v7.h[1]
-; VBITS_GE_256-NEXT:    sshr v17.4h, v17.4h, #8
-; VBITS_GE_256-NEXT:    umov w11, v7.h[0]
-; VBITS_GE_256-NEXT:    sunpklo z7.s, z17.h
-; VBITS_GE_256-NEXT:    sunpklo z7.d, z7.s
-; VBITS_GE_256-NEXT:    strb w8, [sp, #3]
-; VBITS_GE_256-NEXT:    strb w9, [sp, #2]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z7.d, #0
-; VBITS_GE_256-NEXT:    strb w10, [sp, #1]
-; VBITS_GE_256-NEXT:    strb w11, [sp]
-; VBITS_GE_256-NEXT:    ld1b { z7.d }, p2/z, [z16.d]
-; VBITS_GE_256-NEXT:    zip1 v16.8b, v5.8b, v0.8b
-; VBITS_GE_256-NEXT:    uzp1 z7.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    shl v16.4h, v16.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z7.h, z7.h, z7.h
-; VBITS_GE_256-NEXT:    umov w8, v7.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v7.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v7.h[1]
-; VBITS_GE_256-NEXT:    sshr v16.4h, v16.4h, #8
-; VBITS_GE_256-NEXT:    umov w11, v7.h[0]
-; VBITS_GE_256-NEXT:    sunpklo z7.s, z16.h
-; VBITS_GE_256-NEXT:    sunpklo z7.d, z7.s
-; VBITS_GE_256-NEXT:    strb w8, [sp, #23]
-; VBITS_GE_256-NEXT:    strb w9, [sp, #22]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z7.d, #0
-; VBITS_GE_256-NEXT:    strb w10, [sp, #21]
-; VBITS_GE_256-NEXT:    zip2 v7.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT:    strb w11, [sp, #20]
-; VBITS_GE_256-NEXT:    zip1 v4.8b, v4.8b, v0.8b
-; VBITS_GE_256-NEXT:    ld1b { z6.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT:    shl v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT:    shl v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z6.s, z6.s, z6.s
-; VBITS_GE_256-NEXT:    sshr v7.4h, v7.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z6.h, z6.h, z6.h
-; VBITS_GE_256-NEXT:    sshr v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT:    umov w8, v6.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v6.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v6.h[1]
-; VBITS_GE_256-NEXT:    umov w11, v6.h[0]
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z7.h
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    strb w8, [sp, #19]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    strb w9, [sp, #18]
-; VBITS_GE_256-NEXT:    strb w10, [sp, #17]
-; VBITS_GE_256-NEXT:    strb w11, [sp, #16]
-; VBITS_GE_256-NEXT:    ld1b { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    umov w8, v3.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v3.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v3.h[1]
-; VBITS_GE_256-NEXT:    umov w11, v3.h[0]
-; VBITS_GE_256-NEXT:    ext v3.16b, v5.16b, v5.16b, #8
-; VBITS_GE_256-NEXT:    strb w8, [sp, #15]
-; VBITS_GE_256-NEXT:    strb w9, [sp, #14]
-; VBITS_GE_256-NEXT:    strb w10, [sp, #13]
-; VBITS_GE_256-NEXT:    zip2 v4.8b, v3.8b, v0.8b
-; VBITS_GE_256-NEXT:    strb w11, [sp, #12]
-; VBITS_GE_256-NEXT:    ld1b { z2.d }, p2/z, [z2.d]
-; VBITS_GE_256-NEXT:    shl v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    sshr v4.4h, v4.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v2.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v2.h[1]
-; VBITS_GE_256-NEXT:    umov w11, v2.h[0]
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    strb w8, [sp, #11]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    strb w9, [sp, #10]
-; VBITS_GE_256-NEXT:    zip1 v2.8b, v3.8b, v0.8b
-; VBITS_GE_256-NEXT:    strb w10, [sp, #9]
-; VBITS_GE_256-NEXT:    strb w11, [sp, #8]
-; VBITS_GE_256-NEXT:    ld1b { z1.d }, p2/z, [z1.d]
-; VBITS_GE_256-NEXT:    shl v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT:    sshr v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v1.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v1.h[1]
-; VBITS_GE_256-NEXT:    umov w11, v1.h[0]
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z2.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    strb w8, [sp, #31]
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    strb w9, [sp, #30]
-; VBITS_GE_256-NEXT:    strb w10, [sp, #29]
-; VBITS_GE_256-NEXT:    strb w11, [sp, #28]
-; VBITS_GE_256-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    umov w8, v0.h[3]
-; VBITS_GE_256-NEXT:    umov w9, v0.h[2]
-; VBITS_GE_256-NEXT:    umov w10, v0.h[1]
-; VBITS_GE_256-NEXT:    umov w11, v0.h[0]
-; VBITS_GE_256-NEXT:    strb w8, [sp, #27]
-; VBITS_GE_256-NEXT:    strb w9, [sp, #26]
-; VBITS_GE_256-NEXT:    strb w10, [sp, #25]
-; VBITS_GE_256-NEXT:    strb w11, [sp, #24]
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    mov sp, x29
-; VBITS_GE_256-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1b { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x i8>, <32 x i8>* %a
   %ptrs = load <32 x i8*>, <32 x i8*>* %b
   %mask = icmp eq <32 x i8> %cval, zeroinitializer
@@ -414,7 +168,7 @@ define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
 ; LD1H
 ;
 
-define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
+define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
@@ -439,7 +193,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
+define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -463,7 +217,6 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
 }
 
 define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: masked_gather_v8i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
@@ -511,69 +264,21 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z0.h, #0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    ld1h { z6.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    ld1h { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z4.s
-; VBITS_GE_256-NEXT:    ld1h { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z6.s, z6.s
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    mov v2.d[1], v3.d[0]
-; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z0.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x i16>, <16 x i16>* %a
   %ptrs = load <16 x i16*>, <16 x i16*>* %b
   %mask = icmp eq <16 x i16> %cval, zeroinitializer
@@ -582,111 +287,21 @@ define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z3.h, #0
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z18.s, z3.h
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ext v5.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT:    ext v18.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    sunpklo z18.s, z18.h
-; VBITS_GE_256-NEXT:    ld1h { z17.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z4.h, #0
-; VBITS_GE_256-NEXT:    sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT:    ld1h { z4.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT:    mov z16.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z3.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT:    ld1h { z6.d }, p3/z, [z6.d]
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z7.h, z17.h, z17.h
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z6.s, z6.s, z6.s
-; VBITS_GE_256-NEXT:    mov v7.d[1], v4.d[0]
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    ext v4.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT:    uzp1 z6.h, z6.h, z6.h
-; VBITS_GE_256-NEXT:    mov v3.d[1], v6.d[0]
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z16.h
-; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT:    ext v17.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z4.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z16.h
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z17.h
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z2.d }, p3/z, [z2.d]
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p2/z, [z1.d]
-; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z7.h, p1, z7.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    mov v4.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    splice z4.h, p1, z4.h, z1.h
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x i16>, <32 x i16>* %a
   %ptrs = load <32 x i16*>, <32 x i16*>* %b
   %mask = icmp eq <32 x i16> %cval, zeroinitializer
@@ -699,7 +314,7 @@ define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
 ; LD1W
 ;
 
-define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
+define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -720,7 +335,7 @@ define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
+define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -742,7 +357,6 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
 }
 
 define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: masked_gather_v8i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
@@ -787,61 +401,19 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z0.s, #0
-; VBITS_GE_256-NEXT:    cmpeq p3.s, p0/z, z1.s, #0
-; VBITS_GE_256-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z4.d }, p2/z, [z4.d]
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z0.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p3/z, [z3.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    splice z3.s, p1, z3.s, z0.s
-; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z2.s
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x i32>, <16 x i32>* %a
   %ptrs = load <16 x i32*>, <16 x i32*>* %b
   %mask = icmp eq <16 x i32> %cval, zeroinitializer
@@ -850,97 +422,19 @@ define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z0.s, #0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p4.s, p0/z, z2.s, #0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [z17.d]
-; VBITS_GE_256-NEXT:    cmpeq p3.s, p0/z, z1.s, #0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z3.s, #0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [z6.d]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [z4.d]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x i32>, <32 x i32>* %a
   %ptrs = load <32 x i32*>, <32 x i32*>* %b
   %mask = icmp eq <32 x i32> %cval, zeroinitializer
@@ -954,7 +448,7 @@ define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
 ;
 
 ; Scalarize 1 x i64 gathers
-define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
+define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -976,7 +470,7 @@ define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
+define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -995,7 +489,7 @@ define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
+define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1014,7 +508,6 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
 }
 
 define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: masked_gather_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #4
@@ -1040,7 +533,6 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
-
   %cval = load <8 x i64>, <8 x i64>* %a
   %ptrs = load <8 x i64*>, <8 x i64*>* %b
   %mask = icmp eq <8 x i64> %cval, zeroinitializer
@@ -1049,44 +541,16 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT:    cmpeq p3.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, #0
-; VBITS_GE_256-NEXT:    cmpeq p4.d, p0/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p3/z, [z4.d]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x i64>, <16 x i64>* %a
   %ptrs = load <16 x i64*>, <16 x i64*>* %b
   %mask = icmp eq <16 x i64> %cval, zeroinitializer
@@ -1095,68 +559,16 @@ define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    mov x13, #24
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z19.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z21.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [z19.d]
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z6.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z21.d]
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [z22.d]
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p2/z, [z20.d]
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z4.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z5.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [z16.d]
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z7.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [z23.d]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x i64>, <32 x i64>* %a
   %ptrs = load <32 x i64*>, <32 x i64*>* %b
   %mask = icmp eq <32 x i64> %cval, zeroinitializer
@@ -1169,7 +581,7 @@ define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
 ; LD1H (float)
 ;
 
-define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
+define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
@@ -1205,7 +617,7 @@ define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
+define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -1276,69 +688,21 @@ define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov z2.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT:    ld1h { z6.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    ld1h { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z4.s
-; VBITS_GE_256-NEXT:    ld1h { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z6.s, z6.s
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    mov v2.d[1], v3.d[0]
-; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
-; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z0.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x half>, <16 x half>* %a
   %ptrs = load <16 x half*>, <16 x half*>* %b
   %mask = fcmp oeq <16 x half> %cval, zeroinitializer
@@ -1347,111 +711,21 @@ define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z3.h, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    sunpklo z18.s, z3.h
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ext v5.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT:    ext v18.16b, v3.16b, v3.16b, #8
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    sunpklo z18.s, z18.h
-; VBITS_GE_256-NEXT:    ld1h { z17.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z4.h, #0.0
-; VBITS_GE_256-NEXT:    sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT:    ld1h { z4.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT:    mov z16.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z18.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z3.d }, p2/z, [z7.d]
-; VBITS_GE_256-NEXT:    ld1h { z6.d }, p3/z, [z6.d]
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z7.h, z17.h, z17.h
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z6.s, z6.s, z6.s
-; VBITS_GE_256-NEXT:    mov v7.d[1], v4.d[0]
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    ext v4.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT:    uzp1 z6.h, z6.h, z6.h
-; VBITS_GE_256-NEXT:    mov v3.d[1], v6.d[0]
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z16.h
-; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT:    ext v17.16b, v16.16b, v16.16b, #8
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    sunpklo z4.d, z4.s
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z4.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z16.h
-; VBITS_GE_256-NEXT:    sunpklo z6.s, z17.h
-; VBITS_GE_256-NEXT:    sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z6.s
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z2.d }, p3/z, [z2.d]
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p2/z, [z1.d]
-; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z7.h, p1, z7.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    mov v4.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    splice z4.h, p1, z4.h, z1.h
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x half>, <32 x half>* %a
   %ptrs = load <32 x half*>, <32 x half*>* %b
   %mask = fcmp oeq <32 x half> %cval, zeroinitializer
@@ -1464,7 +738,7 @@ define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
 ; LD1W (float)
 ;
 
-define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
+define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -1485,7 +759,7 @@ define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
+define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -1551,61 +825,19 @@ define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z4.d }, p2/z, [z4.d]
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z0.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p3/z, [z3.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p2/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    splice z3.s, p1, z3.s, z0.s
-; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z2.s
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x float>, <16 x float>* %a
   %ptrs = load <16 x float*>, <16 x float*>* %b
   %mask = fcmp oeq <16 x float> %cval, zeroinitializer
@@ -1614,97 +846,19 @@ define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [z17.d]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [z6.d]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [z4.d]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b
   %mask = fcmp oeq <32 x float> %cval, zeroinitializer
@@ -1718,7 +872,7 @@ define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
 ;
 
 ; Scalarize 1 x double gathers
-define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
+define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -1740,7 +894,7 @@ define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
+define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -1759,7 +913,7 @@ define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
+define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1811,44 +965,16 @@ define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p4.d, p0/z, z3.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p3/z, [z4.d]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z6.d]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <16 x double>, <16 x double>* %a
   %ptrs = load <16 x double*>, <16 x double*>* %b
   %mask = fcmp oeq <16 x double> %cval, zeroinitializer
@@ -1857,68 +983,16 @@ define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #12
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    mov x13, #24
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z19.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z21.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z23.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [z19.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z6.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z21.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [z22.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z3.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p2/z, [z20.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z4.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z5.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p2/z, [z17.d]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [z16.d]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z7.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [z23.d]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %cval = load <32 x double>, <32 x double>* %a
   %ptrs = load <32 x double*>, <32 x double*>* %b
   %mask = fcmp oeq <32 x double> %cval, zeroinitializer
@@ -1930,61 +1004,19 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
 ; The above tests test the types, the below tests check that the addressing
 ; modes still function
 
-define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT:    mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ld1h { z4.s }, p2/z, [x2, z4.s, sxtw #1]
-; VBITS_GE_256-NEXT:    cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ld1h { z0.s }, p2/z, [x2, z3.s, sxtw #1]
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT:    ld1h { z1.s }, p2/z, [x2, z5.s, sxtw #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.s }, p1/z, [x2, z2.s, sxtw #1]
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_sext_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -1995,44 +1027,16 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
   ret void
 }
 
-define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p3/z, [x2, z4.s, sxtw #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x2, z6.s, sxtw #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x2, z5.s, sxtw #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p4/z, [x2, z7.s, sxtw #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2]
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_sext_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -2043,77 +1047,16 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b,
   ret void
 }
 
-define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z18.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p1/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z19.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z5.d, #0.0
-; VBITS_GE_256-NEXT:    sunpklo z22.d, z18.s
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z18.d, z18.s
-; VBITS_GE_256-NEXT:    sunpklo z21.d, z17.s
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [x2, z18.d, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT:    ext z17.b, z17.b, z17.b, #16
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x2, z22.d, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, #0.0
-; VBITS_GE_256-NEXT:    sunpklo z20.d, z16.s
-; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z17.d, z17.s
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x2, z21.d, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z3.d, #0.0
-; VBITS_GE_256-NEXT:    sunpklo z16.d, z16.s
-; VBITS_GE_256-NEXT:    sunpklo z23.d, z19.s
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p2/z, [x2, z17.d, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z4.d, #0.0
-; VBITS_GE_256-NEXT:    sunpklo z19.d, z19.s
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p2/z, [x2, z20.d, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x2, z16.d, lsl #3]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z6.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z7.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p2/z, [x2, z23.d, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x2, z19.d, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p1/z, [x2, z1.d, lsl #3]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_sext_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x2, z1.d, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x double>, <32 x double>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -2124,61 +1067,19 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b
   ret void
 }
 
-define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_scaled_zext:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT:    mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ld1h { z4.s }, p2/z, [x2, z4.s, uxtw #1]
-; VBITS_GE_256-NEXT:    cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ld1h { z0.s }, p2/z, [x2, z3.s, uxtw #1]
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT:    ld1h { z1.s }, p2/z, [x2, z5.s, uxtw #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.s }, p1/z, [x2, z2.s, uxtw #1]
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_zext:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1]
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_scaled_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = zext <32 x i32> %idxs to <32 x i64>
@@ -2189,61 +1090,19 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
   ret void
 }
 
-define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_sext:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT:    mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ld1h { z4.s }, p2/z, [x2, z4.s, sxtw]
-; VBITS_GE_256-NEXT:    cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ld1h { z0.s }, p2/z, [x2, z3.s, sxtw]
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT:    ld1h { z1.s }, p2/z, [x2, z5.s, sxtw]
-; VBITS_GE_256-NEXT:    ld1h { z2.s }, p1/z, [x2, z2.s, sxtw]
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_sext:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw]
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_unscaled_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -2255,61 +1114,19 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
   ret void
 }
 
-define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_zext:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p1/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p1/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p3.h, p0/z, z1.h, #0.0
-; VBITS_GE_256-NEXT:    mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    mov z1.h, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ld1h { z4.s }, p2/z, [x2, z4.s, uxtw]
-; VBITS_GE_256-NEXT:    cmpne p2.s, p1/z, z0.s, #0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    ld1h { z0.s }, p2/z, [x2, z3.s, uxtw]
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
-; VBITS_GE_256-NEXT:    ld1h { z1.s }, p2/z, [x2, z5.s, uxtw]
-; VBITS_GE_256-NEXT:    ld1h { z2.s }, p1/z, [x2, z2.s, uxtw]
-; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z3.h, p1, z3.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_zext:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw]
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_32b_unscaled_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = zext <32 x i32> %idxs to <32 x i64>
@@ -2321,97 +1138,19 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8
   ret void
 }
 
-define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_64b_scaled:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [x2, z18.d, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [x2, z17.d, lsl #2]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [x2, z16.d, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [x2, z6.d, lsl #2]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [x2, z7.d, lsl #2]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [x2, z5.d, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [x2, z0.d, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [x2, z4.d, lsl #2]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_64b_scaled:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_64b_scaled:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
   %ptrs = getelementptr float, float* %base, <32 x i64> %idxs
@@ -2421,97 +1160,19 @@ define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %
   ret void
 }
 
-define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 {
-; VBITS_GE_256-LABEL: masked_gather_64b_unscaled:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [x2, z18.d]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [x2, z17.d]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [x2, z16.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [x2, z6.d]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [x2, z7.d]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [x2, z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [x2, z0.d]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [x2, z4.d]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_64b_unscaled:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
   %byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %idxs
@@ -2522,97 +1183,19 @@ define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %b
   ret void
 }
 
-define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
-; VBITS_GE_256-LABEL: masked_gather_vec_plus_reg:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [x2, z18.d]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [x2, z17.d]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [x2, z16.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [x2, z6.d]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [x2, z7.d]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [x2, z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [x2, z0.d]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [x2, z4.d]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_vec_plus_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
   %byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 %off
@@ -2623,97 +1206,19 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o
   ret void
 }
 
-define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_vec_plus_imm:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [z18.d, #4]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [z17.d, #4]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [z16.d, #4]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [z6.d, #4]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [z7.d, #4]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [z5.d, #4]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [z0.d, #4]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [z4.d, #4]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [z1.d, #4]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_vec_plus_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d, #4]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
   %byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 4
@@ -2724,115 +1229,21 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) #0 {
-; VBITS_GE_256-LABEL: masked_gather_passthru:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z20.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z23.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z4.s, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    punpklo p3.h, p2.b
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    ld1d { z19.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z21.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z22.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x2, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p0/z, [x2, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x2]
-; VBITS_GE_256-NEXT:    ld1w { z4.d }, p3/z, [z23.d]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z17.s, #0.0
-; VBITS_GE_256-NEXT:    mov z17.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z4.s, z4.s
-; VBITS_GE_256-NEXT:    bif v4.16b, v16.16b, v17.16b
-; VBITS_GE_256-NEXT:    ext z17.b, z17.b, z17.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z23.d, z17.s
-; VBITS_GE_256-NEXT:    ext z16.b, z16.b, z16.b, #16
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z23.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z22.d }, p4/z, [z22.d]
-; VBITS_GE_256-NEXT:    ld1w { z21.d }, p2/z, [z21.d]
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z20.s, #0.0
-; VBITS_GE_256-NEXT:    mov z20.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p2.b
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    uzp1 z21.s, z21.s, z21.s
-; VBITS_GE_256-NEXT:    uzp1 z22.s, z22.s, z22.s
-; VBITS_GE_256-NEXT:    bif v21.16b, v5.16b, v20.16b
-; VBITS_GE_256-NEXT:    ext z20.b, z20.b, z20.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z23.d, z20.s
-; VBITS_GE_256-NEXT:    ext z5.b, z5.b, z5.b, #16
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z23.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z19.d }, p4/z, [z19.d]
-; VBITS_GE_256-NEXT:    ld1w { z18.d }, p3/z, [z18.d]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z7.s, #0.0
-; VBITS_GE_256-NEXT:    mov z7.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p3.b
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    uzp1 z18.s, z18.s, z18.s
-; VBITS_GE_256-NEXT:    bif v18.16b, v1.16b, v7.16b
-; VBITS_GE_256-NEXT:    ext z7.b, z7.b, z7.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z23.d, z7.s
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z23.d, #0
-; VBITS_GE_256-NEXT:    mov z23.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p4/z, [z2.d]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT:    bit v16.16b, v22.16b, v17.16b
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    bif v3.16b, v6.16b, v23.16b
-; VBITS_GE_256-NEXT:    ext z23.b, z23.b, z23.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z17.d, z23.s
-; VBITS_GE_256-NEXT:    ext z6.b, z6.b, z6.b, #16
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z17.d, #0
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z19.s, z19.s
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
-; VBITS_GE_256-NEXT:    bit v5.16b, v17.16b, v20.16b
-; VBITS_GE_256-NEXT:    splice z4.s, p1, z4.s, z16.s
-; VBITS_GE_256-NEXT:    bit v1.16b, v2.16b, v7.16b
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    bif v0.16b, v6.16b, v23.16b
-; VBITS_GE_256-NEXT:    splice z21.s, p1, z21.s, z5.s
-; VBITS_GE_256-NEXT:    splice z18.s, p1, z18.s, z1.s
-; VBITS_GE_256-NEXT:    st1w { z21.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z18.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    splice z3.s, p1, z3.s, z0.s
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_passthru:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x2]
-; VBITS_GE_2048-NEXT:    punpklo p2.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p2/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_2048-NEXT:    mov z0.s, p1/m, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2]
+; CHECK-NEXT:    punpklo p2.h, p1.b
+; CHECK-NEXT:    ld1w { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b
   %passthru = load <32 x float>, <32 x float>* %c
@@ -2842,97 +1253,19 @@ define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x f
   ret void
 }
 
-define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 {
-; VBITS_GE_256-LABEL: masked_gather_passthru_0:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x14, #28
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p1/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z18.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #20
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov z19.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z2.s, #0.0
-; VBITS_GE_256-NEXT:    ext z19.b, z19.b, z19.b, #16
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z19.s
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p3.d, p1/z, z2.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p1/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p1/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p1/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p1/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [z18.d]
-; VBITS_GE_256-NEXT:    ld1w { z17.d }, p3/z, [z17.d]
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z1.s, #0.0
-; VBITS_GE_256-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z3.s, #0.0
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT:    mov z18.s, p4/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p3.h, p3.b
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    and p3.b, p3/z, p3.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p5.d, p1/z, z1.d, #0
-; VBITS_GE_256-NEXT:    punpklo p4.h, p4.b
-; VBITS_GE_256-NEXT:    ext z18.b, z18.b, z18.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ld1w { z16.d }, p3/z, [z16.d]
-; VBITS_GE_256-NEXT:    ld1w { z1.d }, p5/z, [z6.d]
-; VBITS_GE_256-NEXT:    and p4.b, p4/z, p4.b, p1.b
-; VBITS_GE_256-NEXT:    sunpklo z6.d, z18.s
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
-; VBITS_GE_256-NEXT:    sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT:    ld1w { z7.d }, p4/z, [z7.d]
-; VBITS_GE_256-NEXT:    cmpne p4.d, p1/z, z6.d, #0
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z3.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z5.d }, p4/z, [z5.d]
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p2/z, [z0.d]
-; VBITS_GE_256-NEXT:    ld1w { z3.d }, p1/z, [z4.d]
-; VBITS_GE_256-NEXT:    ptrue p3.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z4.s, z16.s, z16.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z17.s, z17.s, z17.s
-; VBITS_GE_256-NEXT:    splice z4.s, p3, z4.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z7.s, z7.s
-; VBITS_GE_256-NEXT:    uzp1 z5.s, z5.s, z5.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    splice z2.s, p3, z2.s, z17.s
-; VBITS_GE_256-NEXT:    splice z1.s, p3, z1.s, z5.s
-; VBITS_GE_256-NEXT:    splice z0.s, p3, z0.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_gather_passthru_0:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_passthru_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b
   %mask = fcmp oeq <32 x float> %cvals, zeroinitializer

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index f4ce76e9986c..28e442e4cfe2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -1,28 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ;
 ; Masked Loads
 ;
-define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
+
+define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
@@ -53,7 +40,7 @@ define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
   ret <2 x half> %load
 }
 
-define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
+define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -71,7 +58,7 @@ define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
   ret <2 x float> %load
 }
 
-define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
+define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -89,7 +76,7 @@ define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
   ret <4 x float> %load
 }
 
-define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
+define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -107,6 +94,22 @@ define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
 }
 
 define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_v16f32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -123,16 +126,16 @@ define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0
   ret <16 x float> %load
 }
 
-define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
-; VBITS_GE_1024-LABEL: masked_load_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT:    ret
+define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_load_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %a = load <32 x float>, <32 x float>* %ap
   %b = load <32 x float>, <32 x float>* %bp
   %mask = fcmp oeq <32 x float> %a, %b
@@ -140,16 +143,16 @@ define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0
   ret <32 x float> %load
 }
 
-define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %a = load <64 x float>, <64 x float>* %ap
   %b = load <64 x float>, <64 x float>* %bp
   %mask = fcmp oeq <64 x float> %a, %b
@@ -158,6 +161,22 @@ define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0
 }
 
 define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w9, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x9]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x9]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z2.b
+; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z1.b, z3.b
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0, x9]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x8, x9]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_v64i8:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
@@ -175,6 +194,22 @@ define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 }
 
 define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_v32i16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
@@ -192,6 +227,22 @@ define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 }
 
 define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_v16i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -209,6 +260,22 @@ define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
 }
 
 define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_v8i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -226,6 +293,24 @@ define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
 }
 
 define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT:    sel z1.d, p2, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -244,6 +329,24 @@ define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0
 }
 
 define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x0]
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
+; VBITS_GE_256-NEXT:    sel z1.d, p2, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -262,6 +365,21 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
 }
 
 define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
@@ -278,6 +396,24 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 }
 
 define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
+; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -294,6 +430,25 @@ define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
 }
 
 define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr d0, [x1]
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
+; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -310,6 +465,21 @@ define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 }
 
 define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -326,6 +496,24 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 }
 
 define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
+; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -342,6 +530,21 @@ define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 }
 
 define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -358,6 +561,21 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 }
 
 define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
@@ -374,6 +592,24 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 }
 
 define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
+; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -390,6 +626,25 @@ define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
 }
 
 define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr d0, [x1]
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
+; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -406,6 +661,21 @@ define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 }
 
 define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -422,6 +692,24 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 }
 
 define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
+; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -438,6 +726,21 @@ define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 }
 
 define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -454,6 +757,30 @@ define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 }
 
 define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z1.h, #0
+; VBITS_GE_256-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
@@ -470,6 +797,33 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp
 }
 
 define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -486,6 +840,35 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp
 }
 
 define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -502,6 +885,32 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0
 }
 
 define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -518,6 +927,33 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %
 }
 
 define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -534,6 +970,30 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp)
 }
 
 define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
+; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -550,6 +1010,30 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
 }
 
 define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z1.h, #0
+; VBITS_GE_256-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
@@ -566,6 +1050,33 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp
 }
 
 define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -582,6 +1093,35 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp
 }
 
 define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -598,6 +1138,32 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0
 }
 
 define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
@@ -614,6 +1180,33 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %
 }
 
 define <8 x i64> @masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -630,6 +1223,30 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp)
 }
 
 define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
+; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -645,15 +1262,15 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
   ret <8 x i64> %ext
 }
 
-define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v128i8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ld1sb { z0.h }, p1/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <128 x i8>, <128 x i8>* %bp
   %mask = icmp eq <128 x i8> %b, zeroinitializer
   %load = call <128 x i8> @llvm.masked.load.v128i8(<128 x i8>* %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
@@ -661,15 +1278,15 @@ define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
   ret <128 x i16> %ext
 }
 
-define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1b { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    ld1sb { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v64i8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1sb { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <64 x i8>, <64 x i8>* %bp
   %mask = icmp eq <64 x i8> %b, zeroinitializer
   %load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
@@ -677,15 +1294,15 @@ define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
   ret <64 x i32> %ext
 }
 
-define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1sb { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v32i8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1sb { z0.d }, p1/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <32 x i8>, <32 x i8>* %bp
   %mask = icmp eq <32 x i8> %b, zeroinitializer
   %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
@@ -693,15 +1310,15 @@ define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
   ret <32 x i64> %ext
 }
 
-define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v64i16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1sh { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <64 x i16>, <64 x i16>* %bp
   %mask = icmp eq <64 x i16> %b, zeroinitializer
   %load = call <64 x i16> @llvm.masked.load.v64i16(<64 x i16>* %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
@@ -709,15 +1326,15 @@ define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
   ret <64 x i32> %ext
 }
 
-define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v32i16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1sh { z0.d }, p1/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <32 x i16>, <32 x i16>* %bp
   %mask = icmp eq <32 x i16> %b, zeroinitializer
   %load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
@@ -725,15 +1342,15 @@ define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
   ret <32 x i64> %ext
 }
 
-define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_sext_v32i32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1sw { z0.d }, p1/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <32 x i32>, <32 x i32>* %bp
   %mask = icmp eq <32 x i32> %b, zeroinitializer
   %load = call <32 x i32> @llvm.masked.load.v32i32(<32 x i32>* %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
@@ -741,15 +1358,15 @@ define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
   ret <32 x i64> %ext
 }
 
-define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v128i8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <128 x i8>, <128 x i8>* %bp
   %mask = icmp eq <128 x i8> %b, zeroinitializer
   %load = call <128 x i8> @llvm.masked.load.v128i8(<128 x i8>* %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
@@ -757,15 +1374,15 @@ define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
   ret <128 x i16> %ext
 }
 
-define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1b { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    ld1b { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v64i8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1b { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <64 x i8>, <64 x i8>* %bp
   %mask = icmp eq <64 x i8> %b, zeroinitializer
   %load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
@@ -773,15 +1390,15 @@ define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
   ret <64 x i32> %ext
 }
 
-define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v32i8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1b { z0.d }, p1/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <32 x i8>, <32 x i8>* %bp
   %mask = icmp eq <32 x i8> %b, zeroinitializer
   %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
@@ -789,15 +1406,15 @@ define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
   ret <32 x i64> %ext
 }
 
-define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v64i16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <64 x i16>, <64 x i16>* %bp
   %mask = icmp eq <64 x i16> %b, zeroinitializer
   %load = call <64 x i16> @llvm.masked.load.v64i16(<64 x i16>* %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
@@ -805,15 +1422,15 @@ define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
   ret <64 x i32> %ext
 }
 
-define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v32i16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <32 x i16>, <32 x i16>* %bp
   %mask = icmp eq <32 x i16> %b, zeroinitializer
   %load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
@@ -821,15 +1438,15 @@ define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
   ret <32 x i64> %ext
 }
 
-define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
-; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
-; VBITS_GE_2048-NEXT:    ret
+define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_load_zext_v32i32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ret
   %b = load <32 x i32>, <32 x i32>* %bp
   %mask = icmp eq <32 x i32> %b, zeroinitializer
   %load = call <32 x i32> @llvm.masked.load.v32i32(<32 x i32>* %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
@@ -838,6 +1455,21 @@ define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
 }
 
 define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -854,6 +1486,21 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp)
 }
 
 define <8 x i64> @masked_load_zext_sgt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
+; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index edf937ab562e..58834bf39eb8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -1,31 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function masked_scatter_v8i8,masked_scatter_v8i16,masked_scatter_v8i32,masked_scatter_v8i64 --prefix VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; ST1B
 ;
 
-define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
+define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
@@ -47,7 +31,7 @@ define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
+define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
@@ -70,36 +54,36 @@ define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
 }
 
 define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i8:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ldr d0, [x0]
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    cmeq v1.8b, v0.8b, #0
-; VBITS_EQ_256-NEXT:    zip1 v5.8b, v0.8b, v0.8b
-; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    zip1 v2.8b, v1.8b, v0.8b
-; VBITS_EQ_256-NEXT:    zip2 v1.8b, v1.8b, v0.8b
-; VBITS_EQ_256-NEXT:    zip2 v0.8b, v0.8b, v0.8b
-; VBITS_EQ_256-NEXT:    shl v2.4h, v2.4h, #8
-; VBITS_EQ_256-NEXT:    shl v1.4h, v1.4h, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    sshr v2.4h, v2.4h, #8
-; VBITS_EQ_256-NEXT:    sshr v1.4h, v1.4h, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    sunpklo z2.s, z2.h
-; VBITS_EQ_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_EQ_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_EQ_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
-; VBITS_EQ_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z5.h
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT:    st1b { z1.d }, p1, [z4.d]
-; VBITS_EQ_256-NEXT:    st1b { z0.d }, p0, [z3.d]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr d0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    cmeq v1.8b, v0.8b, #0
+; VBITS_GE_256-NEXT:    zip1 v5.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    zip1 v2.8b, v1.8b, v0.8b
+; VBITS_GE_256-NEXT:    zip2 v1.8b, v1.8b, v0.8b
+; VBITS_GE_256-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT:    shl v2.4h, v2.4h, #8
+; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sshr v2.4h, v2.4h, #8
+; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z5.h
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1b { z1.d }, p1, [z4.d]
+; VBITS_GE_256-NEXT:    st1b { z0.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i8:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr d0, [x0]
@@ -122,22 +106,22 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ldr q0, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    cmeq v2.16b, v0.16b, #0
-; VBITS_GE_1024-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    sunpklo z2.h, z2.b
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_1024-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_1024-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_1024-NEXT:    st1b { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmeq v2.16b, v0.16b, #0
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x i8>, <16 x i8>* %a
   %ptrs = load <16 x i8*>, <16 x i8*>* %b
   %mask = icmp eq <16 x i8> %vals, zeroinitializer
@@ -145,22 +129,22 @@ define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1b { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x i8>, <32 x i8>* %a
   %ptrs = load <32 x i8*>, <32 x i8*>* %b
   %mask = icmp eq <32 x i8> %vals, zeroinitializer
@@ -172,7 +156,7 @@ define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
 ; ST1H
 ;
 
-define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
+define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
@@ -194,7 +178,7 @@ define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
+define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -216,30 +200,30 @@ define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
 }
 
 define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ldr q0, [x0]
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    cmeq v1.8h, v0.8h, #0
-; VBITS_EQ_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    sunpklo z2.s, z1.h
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; VBITS_EQ_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_EQ_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
-; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_EQ_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_EQ_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT:    st1h { z0.d }, p1, [z2.d]
-; VBITS_EQ_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z3.s
-; VBITS_EQ_256-NEXT:    st1h { z1.d }, p0, [z4.d]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    cmeq v1.8h, v0.8h, #0
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1h { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z3.s
+; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr q0, [x0]
@@ -260,20 +244,20 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT:    st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x i16>, <16 x i16>* %a
   %ptrs = load <16 x i16*>, <16 x i16*>* %b
   %mask = icmp eq <16 x i16> %vals, zeroinitializer
@@ -281,20 +265,20 @@ define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x i16>, <32 x i16>* %a
   %ptrs = load <32 x i16*>, <32 x i16*>* %b
   %mask = icmp eq <32 x i16> %vals, zeroinitializer
@@ -306,7 +290,7 @@ define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
 ; ST1W
 ;
 
-define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
+define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -325,7 +309,7 @@ define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
+define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -345,28 +329,28 @@ define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
 }
 
 define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i32:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ptrue p1.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
-; VBITS_EQ_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_EQ_256-NEXT:    uunpklo z4.d, z0.s
-; VBITS_EQ_256-NEXT:    mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_EQ_256-NEXT:    punpklo p0.h, p0.b
-; VBITS_EQ_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_EQ_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_EQ_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_EQ_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; VBITS_EQ_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    st1w { z4.d }, p0, [z3.d]
-; VBITS_EQ_256-NEXT:    st1w { z0.d }, p1, [z1.d]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    uunpklo z4.d, z0.s
+; VBITS_GE_256-NEXT:    mov z2.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1w { z4.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT:    st1w { z0.d }, p1, [z1.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
@@ -385,18 +369,18 @@ define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x i32>, <16 x i32>* %a
   %ptrs = load <16 x i32*>, <16 x i32*>* %b
   %mask = icmp eq <16 x i32> %vals, zeroinitializer
@@ -404,18 +388,18 @@ define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x i32>, <32 x i32>* %a
   %ptrs = load <32 x i32*>, <32 x i32*>* %b
   %mask = icmp eq <32 x i32> %vals, zeroinitializer
@@ -428,7 +412,7 @@ define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
 ;
 
 ; Scalarize 1 x i64 scatters
-define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
+define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -447,7 +431,7 @@ define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
+define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -464,7 +448,7 @@ define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
+define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -481,20 +465,20 @@ define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
 }
 
 define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: masked_scatter_v8i64:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_EQ_256-NEXT:    cmpeq p0.d, p0/z, z1.d, #0
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [z3.d]
-; VBITS_EQ_256-NEXT:    st1d { z0.d }, p1, [z2.d]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_scatter_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -510,15 +494,15 @@ define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x i64>, <16 x i64>* %a
   %ptrs = load <16 x i64*>, <16 x i64*>* %b
   %mask = icmp eq <16 x i64> %vals, zeroinitializer
@@ -526,15 +510,15 @@ define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x i64>, <32 x i64>* %a
   %ptrs = load <32 x i64*>, <32 x i64*>* %b
   %mask = icmp eq <32 x i64> %vals, zeroinitializer
@@ -546,7 +530,7 @@ define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
 ; ST1H (float)
 ;
 
-define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
+define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
@@ -580,7 +564,7 @@ define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
+define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -602,6 +586,30 @@ define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
 }
 
 define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
+; VBITS_GE_256-LABEL: masked_scatter_v8f16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ldr q0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    fcmeq v1.8h, v0.8h, #0.0
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1h { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z3.s
+; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr q0, [x0]
@@ -622,20 +630,20 @@ define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT:    st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x half>, <16 x half>* %a
   %ptrs = load <16 x half*>, <16 x half*>* %b
   %mask = fcmp oeq <16 x half> %vals, zeroinitializer
@@ -643,20 +651,20 @@ define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %ptrs = load <32 x half*>, <32 x half*>* %b
   %mask = fcmp oeq <32 x half> %vals, zeroinitializer
@@ -668,7 +676,7 @@ define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
 ; ST1W (float)
 ;
 
-define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
+define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -687,7 +695,7 @@ define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
+define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -707,6 +715,28 @@ define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
 }
 
 define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
+; VBITS_GE_256-LABEL: masked_scatter_v8f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
+; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_256-NEXT:    uunpklo z4.d, z0.s
+; VBITS_GE_256-NEXT:    mov z2.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    st1w { z4.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT:    st1w { z0.d }, p1, [z1.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8f32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
@@ -725,18 +755,18 @@ define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ptrue p1.d, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x float>, <16 x float>* %a
   %ptrs = load <16 x float*>, <16 x float*>* %b
   %mask = fcmp oeq <16 x float> %vals, zeroinitializer
@@ -744,18 +774,18 @@ define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b
   %mask = fcmp oeq <32 x float> %vals, zeroinitializer
@@ -768,7 +798,7 @@ define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
 ;
 
 ; Scalarize 1 x double scatters
-define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
+define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: masked_scatter_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -787,7 +817,7 @@ define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
+define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -804,7 +834,7 @@ define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
+define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -821,6 +851,20 @@ define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
 }
 
 define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
+; VBITS_GE_256-LABEL: masked_scatter_v8f64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; VBITS_GE_256-NEXT:    fcmeq p0.d, p0/z, z1.d, #0.0
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    ret
+;
 ; VBITS_GE_512-LABEL: masked_scatter_v8f64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
@@ -836,15 +880,15 @@ define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_scatter_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_scatter_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <16 x double>, <16 x double>* %a
   %ptrs = load <16 x double*>, <16 x double*>* %b
   %mask = fcmp oeq <16 x double> %vals, zeroinitializer
@@ -852,15 +896,15 @@ define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
   ret void
 }
 
-define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x double>, <32 x double>* %a
   %ptrs = load <32 x double*>, <32 x double*>* %b
   %mask = fcmp oeq <32 x double> %vals, zeroinitializer
@@ -871,18 +915,18 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
 ; The above tests test the types, the below tests check that the addressing
 ; modes still function
 
-define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
+; CHECK-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -892,15 +936,15 @@ define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
   ret void
 }
 
-define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x2, z1.s, sxtw #2]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2, z1.s, sxtw #2]
+; CHECK-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -910,15 +954,15 @@ define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b
   ret void
 }
 
-define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x2, z1.d, lsl #3]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2, z1.d, lsl #3]
+; CHECK-NEXT:    ret
   %vals = load <32 x double>, <32 x double>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -928,18 +972,18 @@ define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %
   ret void
 }
 
-define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_zext:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_scaled_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
+; CHECK-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = zext <32 x i32> %idxs to <32 x i64>
@@ -949,18 +993,18 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal
   ret void
 }
 
-define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_sext:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw]
+; CHECK-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = sext <32 x i32> %idxs to <32 x i64>
@@ -971,18 +1015,18 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i
   ret void
 }
 
-define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_zext:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw]
+; CHECK-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
   %ext = zext <32 x i32> %idxs to <32 x i64>
@@ -993,18 +1037,18 @@ define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i
   ret void
 }
 
-define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_64b_scaled:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x2, z1.d, lsl #2]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_64b_scaled:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d, lsl #2]
+; CHECK-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
   %ptrs = getelementptr float, float* %base, <32 x i64> %idxs
@@ -1013,18 +1057,18 @@ define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float*
   ret void
 }
 
-define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_64b_unscaled:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_64b_unscaled:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
   %byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %idxs
@@ -1034,18 +1078,18 @@ define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %
   ret void
 }
 
-define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_vec_plus_reg:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_vec_plus_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
+; CHECK-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
   %byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 %off
@@ -1055,18 +1099,18 @@ define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %
   ret void
 }
 
-define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_scatter_vec_plus_imm:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [z1.d, #4]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_scatter_vec_plus_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d, #4]
+; CHECK-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
   %byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 4
@@ -1084,18 +1128,18 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
 ; NOTE: For this test to function correctly it's critical for %vals to be in a
 ; different block to the scatter store.  If not, the problematic bitcast will be
 ; removed before operation legalisation and thus not exercise the combine.
-define void @masked_scatter_bitcast_infinite_loop(<8 x double>* %a, <8 x double*>* %b, i1 %cond) #0 {
-; VBITS_GE_512-LABEL: masked_scatter_bitcast_infinite_loop:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    tbz w2, #0, .LBB47_2
-; VBITS_GE_512-NEXT:  // %bb.1: // %bb.1
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [z1.d]
-; VBITS_GE_512-NEXT:  .LBB47_2: // %bb.2
-; VBITS_GE_512-NEXT:    ret
+define void @masked_scatter_bitcast_infinite_loop(<8 x double>* %a, <8 x double*>* %b, i1 %cond) vscale_range(4,0) #0 {
+; CHECK-LABEL: masked_scatter_bitcast_infinite_loop:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    tbz w2, #0, .LBB47_2
+; CHECK-NEXT:  // %bb.1: // %bb.1
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:  .LBB47_2: // %bb.2
+; CHECK-NEXT:    ret
   %vals = load volatile <8 x double>, <8 x double>* %a
   br i1 %cond, label %bb.1, label %bb.2
 

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 0e64e78d5505..3d6099e9a792 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -1,28 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-;;
-;; Masked Stores
-;;
-define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
+;
+; Masked Stores
+;
+
+define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
@@ -52,8 +39,7 @@ define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
   ret void
 }
 
-
-define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
+define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -70,7 +56,7 @@ define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
   ret void
 }
 
-define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
+define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -87,7 +73,7 @@ define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
   ret void
 }
 
-define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
+define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -133,39 +119,15 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
   ret void
 }
 
-define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
-; VBITS_GE_256-LABEL: masked_store_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p3, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p2, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p1, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_store_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_store_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %a = load <32 x float>, <32 x float>* %ap
   %b = load <32 x float>, <32 x float>* %bp
   %mask = fcmp oeq <32 x float> %a, %b
@@ -173,59 +135,15 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
   ret void
 }
 
-define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
-; VBITS_GE_256-LABEL: masked_store_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov x11, #32
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x13, #16
-; VBITS_GE_256-NEXT:    mov x14, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z4.s, z19.s
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z3.s, z18.s
-; VBITS_GE_256-NEXT:    fcmeq p5.s, p0/z, z2.s, z21.s
-; VBITS_GE_256-NEXT:    fcmeq p6.s, p0/z, z1.s, z20.s
-; VBITS_GE_256-NEXT:    fcmeq p7.s, p0/z, z0.s, z22.s
-; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p7, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p6, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p5, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p4, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p3, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p2, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p1, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_store_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_store_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %a = load <64 x float>, <64 x float>* %ap
   %b = load <64 x float>, <64 x float>* %bp
   %mask = fcmp oeq <64 x float> %a, %b
@@ -266,7 +184,6 @@ define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>
 ; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
 ; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [x2]
 ; VBITS_GE_512-NEXT:    ret
-
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
   %mask = icmp eq <8 x i64> %a, %b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
index 4156689233fc..27389a7f1eef 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -163,27 +163,27 @@ define void @test_revwv8i32v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 
 ; REVH pattern for shuffle v32i16 with 256 bits and 512 bits SVE.
 define void @test_revhv32i16(<32 x i16>* %a) #0 {
-; VBITS_EQ_256-LABEL: test_revhv32i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ptrue p1.d
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    revh z0.d, p1/m, z0.d
-; VBITS_EQ_256-NEXT:    revh z1.d, p1/m, z1.d
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
 ; VBITS_GE_256-LABEL: test_revhv32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    revh z0.d, p1/m, z0.d
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    revh z1.d, p1/m, z1.d
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: test_revhv32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ptrue p1.d
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revh z0.d, p1/m, z0.d
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %tmp1 = load <32 x i16>, <32 x i16>* %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i16> %tmp2, <32 x i16>* %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
index e0857f4c71c7..24d2095b57af 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
@@ -1,54 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; RBIT
 ;
 
-define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
 
-define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
 
-define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
+define void @bitreverse_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
   store <32 x i8> %res, <32 x i8>* %a
@@ -56,80 +48,91 @@ define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
 }
 
 define void @bitreverse_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: bitreverse_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    rbit z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: bitreverse_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
-define void @bitreverse_v128i8(<128 x i8>* %a) #0 {
+define void @bitreverse_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
 
-define void @bitreverse_v256i8(<256 x i8>* %a) #0 {
+define void @bitreverse_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
   store <256 x i8> %res, <256 x i8>* %a
   ret void
 }
 
-define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
 
-define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
 
-define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
+define void @bitreverse_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -137,80 +140,91 @@ define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
 }
 
 define void @bitreverse_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: bitreverse_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    rbit z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
-define void @bitreverse_v64i16(<64 x i16>* %a) #0 {
+define void @bitreverse_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @bitreverse_v128i16(<128 x i16>* %a) #0 {
+define void @bitreverse_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
   ret void
 }
 
-define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
 
-define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
 
-define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
+define void @bitreverse_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -218,80 +232,91 @@ define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
 }
 
 define void @bitreverse_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: bitreverse_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    rbit z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
-define void @bitreverse_v32i32(<32 x i32>* %a) #0 {
+define void @bitreverse_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @bitreverse_v64i32(<64 x i32>* %a) #0 {
+define void @bitreverse_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
   ret void
 }
 
-define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
 
-define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
 
-define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
+define void @bitreverse_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
   store <4 x i64> %res, <4 x i64>* %a
@@ -299,49 +324,53 @@ define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
 }
 
 define void @bitreverse_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: bitreverse_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    rbit z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
 
-define void @bitreverse_v16i64(<16 x i64>* %a) #0 {
+define void @bitreverse_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
+define void @bitreverse_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
   store <32 x i64> %res, <32 x i64>* %a
@@ -353,30 +382,33 @@ define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @bswap_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v4i16:
-; CHECK: rev16 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @bswap_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v8i16:
-; CHECK: rev16 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
 
-define void @bswap_v16i16(<16 x i16>* %a) #0 {
+define void @bswap_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -384,49 +416,53 @@ define void @bswap_v16i16(<16 x i16>* %a) #0 {
 }
 
 define void @bswap_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: bswap_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    revb z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    revb z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bswap_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revb z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
-define void @bswap_v64i16(<64 x i16>* %a) #0 {
+define void @bswap_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bswap_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @bswap_v128i16(<128 x i16>* %a) #0 {
+define void @bswap_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bswap_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
@@ -434,30 +470,33 @@ define void @bswap_v128i16(<128 x i16>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v2i32:
-; CHECK: rev32 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @bswap_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v4i32:
-; CHECK: rev32 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
 
-define void @bswap_v8i32(<8 x i32>* %a) #0 {
+define void @bswap_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -465,49 +504,53 @@ define void @bswap_v8i32(<8 x i32>* %a) #0 {
 }
 
 define void @bswap_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: bswap_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    revb z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    revb z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bswap_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revb z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
-define void @bswap_v32i32(<32 x i32>* %a) #0 {
+define void @bswap_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bswap_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @bswap_v64i32(<64 x i32>* %a) #0 {
+define void @bswap_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bswap_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
@@ -515,30 +558,33 @@ define void @bswap_v64i32(<64 x i32>* %a) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @bswap_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v1i64:
-; CHECK: rev64 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @bswap_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v2i64:
-; CHECK: rev64 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
 
-define void @bswap_v4i64(<4 x i64>* %a) #0 {
+define void @bswap_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
   store <4 x i64> %res, <4 x i64>* %a
@@ -546,49 +592,53 @@ define void @bswap_v4i64(<4 x i64>* %a) #0 {
 }
 
 define void @bswap_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: bswap_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    revb z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    revb z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bswap_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revb z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
 
-define void @bswap_v16i64(<16 x i64>* %a) #0 {
+define void @bswap_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bswap_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @bswap_v32i64(<32 x i64>* %a) #0 {
+define void @bswap_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bswap_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
   store <32 x i64> %res, <32 x i64>* %a
@@ -640,4 +690,3 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
 declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
 declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
 declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)
-

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
index 245f34248a54..e62cbcda9c7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -1,23 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
+define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -29,7 +17,7 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
   ret <8 x i8> %res
 }
 
-define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
+define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -41,7 +29,7 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
   ret <16 x i8> %res
 }
 
-define void @sdiv_v32i8(<32 x i8>* %a) #0 {
+define void @sdiv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl32
@@ -81,91 +69,35 @@ define void @sdiv_v64i8(<64 x i8>* %a) #0 {
   ret void
 }
 
-define void @sdiv_v128i8(<128 x i8>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v128i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #96
-; VBITS_GE_256-NEXT:    mov w9, #32
-; VBITS_GE_256-NEXT:    mov w10, #64
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.b, p0/m, z1.b, #5
-; VBITS_GE_256-NEXT:    asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_256-NEXT:    asrd z2.b, p0/m, z2.b, #5
-; VBITS_GE_256-NEXT:    asrd z3.b, p0/m, z3.b, #5
-; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @sdiv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
 
-define void @sdiv_v256i8(<256 x i8>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v256i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #192
-; VBITS_GE_256-NEXT:    mov w9, #96
-; VBITS_GE_256-NEXT:    mov w10, #32
-; VBITS_GE_256-NEXT:    mov w11, #160
-; VBITS_GE_256-NEXT:    mov w12, #64
-; VBITS_GE_256-NEXT:    mov w13, #224
-; VBITS_GE_256-NEXT:    mov w14, #128
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0, x11]
-; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x0, x12]
-; VBITS_GE_256-NEXT:    ld1b { z4.b }, p0/z, [x0, x13]
-; VBITS_GE_256-NEXT:    ld1b { z5.b }, p0/z, [x0, x14]
-; VBITS_GE_256-NEXT:    ld1b { z6.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT:    ld1b { z7.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.b, p0/m, z1.b, #5
-; VBITS_GE_256-NEXT:    asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_256-NEXT:    asrd z3.b, p0/m, z3.b, #5
-; VBITS_GE_256-NEXT:    asrd z2.b, p0/m, z2.b, #5
-; VBITS_GE_256-NEXT:    asrd z5.b, p0/m, z5.b, #5
-; VBITS_GE_256-NEXT:    asrd z4.b, p0/m, z4.b, #5
-; VBITS_GE_256-NEXT:    asrd z6.b, p0/m, z6.b, #5
-; VBITS_GE_256-NEXT:    asrd z7.b, p0/m, z7.b, #5
-; VBITS_GE_256-NEXT:    st1b { z6.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT:    st1b { z4.b }, p0, [x0, x13]
-; VBITS_GE_256-NEXT:    st1b { z5.b }, p0, [x0, x14]
-; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0, x11]
-; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x0, x12]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT:    st1b { z7.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    asrd z0.b, p0/m, z0.b, #5
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @sdiv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer)
   store <256 x i8> %res, <256 x i8>* %a
   ret void
 }
 
-define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
+define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -177,7 +109,7 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
   ret <4 x i16> %res
 }
 
-define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
+define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -189,7 +121,7 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
   ret <8 x i16> %res
 }
 
-define void @sdiv_v16i16(<16 x i16>* %a) #0 {
+define void @sdiv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -229,91 +161,35 @@ define void @sdiv_v32i16(<32 x i16>* %a) #0 {
   ret void
 }
 
-define void @sdiv_v64i16(<64 x i16>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v64i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.h, p0/m, z1.h, #5
-; VBITS_GE_256-NEXT:    asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_256-NEXT:    asrd z2.h, p0/m, z2.h, #5
-; VBITS_GE_256-NEXT:    asrd z3.h, p0/m, z3.h, #5
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @sdiv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
 
-define void @sdiv_v128i16(<128 x i16>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v128i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #96
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x11, #80
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #112
-; VBITS_GE_256-NEXT:    mov x14, #64
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.h, p0/m, z1.h, #5
-; VBITS_GE_256-NEXT:    asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_256-NEXT:    asrd z3.h, p0/m, z3.h, #5
-; VBITS_GE_256-NEXT:    asrd z2.h, p0/m, z2.h, #5
-; VBITS_GE_256-NEXT:    asrd z5.h, p0/m, z5.h, #5
-; VBITS_GE_256-NEXT:    asrd z4.h, p0/m, z4.h, #5
-; VBITS_GE_256-NEXT:    asrd z6.h, p0/m, z6.h, #5
-; VBITS_GE_256-NEXT:    asrd z7.h, p0/m, z7.h, #5
-; VBITS_GE_256-NEXT:    st1h { z6.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    asrd z0.h, p0/m, z0.h, #5
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @sdiv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer)
   store <128 x i16> %res, <128 x i16>* %a
   ret void
 }
 
-define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
+define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -325,7 +201,7 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
   ret <2 x i32> %res
 }
 
-define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
+define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -337,7 +213,7 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
   ret <4 x i32> %res
 }
 
-define void @sdiv_v8i32(<8 x i32>* %a) #0 {
+define void @sdiv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -377,91 +253,35 @@ define void @sdiv_v16i32(<16 x i32>* %a) #0 {
   ret void
 }
 
-define void @sdiv_v32i32(<32 x i32>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v32i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.s, p0/m, z1.s, #5
-; VBITS_GE_256-NEXT:    asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_256-NEXT:    asrd z2.s, p0/m, z2.s, #5
-; VBITS_GE_256-NEXT:    asrd z3.s, p0/m, z3.s, #5
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @sdiv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
 
-define void @sdiv_v64i32(<64 x i32>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v64i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #56
-; VBITS_GE_256-NEXT:    mov x14, #32
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.s, p0/m, z1.s, #5
-; VBITS_GE_256-NEXT:    asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_256-NEXT:    asrd z3.s, p0/m, z3.s, #5
-; VBITS_GE_256-NEXT:    asrd z2.s, p0/m, z2.s, #5
-; VBITS_GE_256-NEXT:    asrd z5.s, p0/m, z5.s, #5
-; VBITS_GE_256-NEXT:    asrd z4.s, p0/m, z4.s, #5
-; VBITS_GE_256-NEXT:    asrd z6.s, p0/m, z6.s, #5
-; VBITS_GE_256-NEXT:    asrd z7.s, p0/m, z7.s, #5
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    asrd z0.s, p0/m, z0.s, #5
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @sdiv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer)
   store <64 x i32> %res, <64 x i32>* %a
   ret void
 }
 
-define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
+define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -474,7 +294,7 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
-define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
+define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -486,7 +306,7 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
   ret <2 x i64> %res
 }
 
-define void @sdiv_v4i64(<4 x i64>* %a) #0 {
+define void @sdiv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -526,84 +346,28 @@ define void @sdiv_v8i64(<8 x i64>* %a) #0 {
   ret void
 }
 
-define void @sdiv_v16i64(<16 x i64>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.d, p0/m, z1.d, #5
-; VBITS_GE_256-NEXT:    asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_256-NEXT:    asrd z2.d, p0/m, z2.d, #5
-; VBITS_GE_256-NEXT:    asrd z3.d, p0/m, z3.d, #5
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: sdiv_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @sdiv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
+; CHECK-LABEL: sdiv_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
 
-define void @sdiv_v32i64(<32 x i64>* %a) #0 {
-; VBITS_GE_256-LABEL: sdiv_v32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    mov x13, #28
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    asrd z1.d, p0/m, z1.d, #5
-; VBITS_GE_256-NEXT:    asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_256-NEXT:    asrd z3.d, p0/m, z3.d, #5
-; VBITS_GE_256-NEXT:    asrd z2.d, p0/m, z2.d, #5
-; VBITS_GE_256-NEXT:    asrd z5.d, p0/m, z5.d, #5
-; VBITS_GE_256-NEXT:    asrd z4.d, p0/m, z4.d, #5
-; VBITS_GE_256-NEXT:    asrd z6.d, p0/m, z6.d, #5
-; VBITS_GE_256-NEXT:    asrd z7.d, p0/m, z7.d, #5
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: sdiv_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    asrd z0.d, p0/m, z0.d, #5
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @sdiv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
+; CHECK-LABEL: sdiv_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer)
   store <32 x i64> %res, <32 x i64>* %a

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index 322ec0eb0110..33877e17c766 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; bigger than NEON. However, having no support opens us up to a code generator
 ; hang when expanding BUILD_VECTOR. Here we just validate the problematic case
 ; successfully exits code generation.
-define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 {
+define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) vscale_range(2,2) #0 {
 ; CHECK-LABEL: hang_when_merging_stores_after_legalisation:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -37,8 +37,8 @@ define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32
   ret void
 }
 
-; Ensure we don't crash when trying to lower a shuffle via and extract
-define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 {
+; Ensure we don't crash when trying to lower a shuffle via an extract
+define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) vscale_range(2,2) #0 {
 ; CHECK-LABEL: crash_when_lowering_extract_shuffle:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tbnz w1, #0, .LBB1_2
@@ -132,4 +132,4 @@ exit:
   ret void
 }
 
-attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
+attributes #0 = { "target-features"="+sve" }

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index c3af3f250b2b..50d0941cc52c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <8 x i8> @splat_v8i8(i8 %a) #0 {
+define <8 x i8> @splat_v8i8(i8 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.8b, w0
@@ -35,7 +21,7 @@ define <8 x i8> @splat_v8i8(i8 %a) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <16 x i8> @splat_v16i8(i8 %a) #0 {
+define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.16b, w0
@@ -45,7 +31,7 @@ define <16 x i8> @splat_v16i8(i8 %a) #0 {
   ret <16 x i8> %splat
 }
 
-define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 {
+define void @splat_v32i8(i8 %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl32
@@ -74,68 +60,32 @@ define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.b, w0
 ; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <64 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
   store <64 x i8> %splat, <64 x i8>* %b
   ret void
 }
 
-define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v128i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #96
-; VBITS_GE_256-NEXT:    mov w9, #64
-; VBITS_GE_256-NEXT:    mov w10, #32
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov z0.b, w0
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    mov z0.b, w0
-; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v128i8(i8 %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    mov z0.b, w0
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <128 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer
   store <128 x i8> %splat, <128 x i8>* %b
   ret void
 }
 
-define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v256i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #224
-; VBITS_GE_256-NEXT:    mov w9, #192
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov z0.b, w0
-; VBITS_GE_256-NEXT:    mov w10, #160
-; VBITS_GE_256-NEXT:    mov w11, #128
-; VBITS_GE_256-NEXT:    mov w12, #96
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT:    mov w8, #64
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT:    mov w9, #32
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x11]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x12]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    mov z0.b, w0
-; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v256i8(i8 %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    mov z0.b, w0
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <256 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer
   store <256 x i8> %splat, <256 x i8>* %b
@@ -143,7 +93,7 @@ define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @splat_v4i16(i16 %a) #0 {
+define <4 x i16> @splat_v4i16(i16 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.4h, w0
@@ -154,7 +104,7 @@ define <4 x i16> @splat_v4i16(i16 %a) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @splat_v8i16(i16 %a) #0 {
+define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.8h, w0
@@ -164,7 +114,7 @@ define <8 x i16> @splat_v8i16(i16 %a) #0 {
   ret <8 x i16> %splat
 }
 
-define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 {
+define void @splat_v16i16(i16 %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -193,68 +143,32 @@ define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.h, w0
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <32 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
   store <32 x i16> %splat, <32 x i16>* %b
   ret void
 }
 
-define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #32
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov z0.h, w0
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov z0.h, w0
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v64i16(i16 %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    mov z0.h, w0
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <64 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
   store <64 x i16> %splat, <64 x i16>* %b
   ret void
 }
 
-define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v128i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #112
-; VBITS_GE_256-NEXT:    mov x9, #96
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov z0.h, w0
-; VBITS_GE_256-NEXT:    mov x10, #80
-; VBITS_GE_256-NEXT:    mov x11, #64
-; VBITS_GE_256-NEXT:    mov x12, #48
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    mov z0.h, w0
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v128i16(i16 %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    mov z0.h, w0
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <128 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
   store <128 x i16> %splat, <128 x i16>* %b
@@ -262,7 +176,7 @@ define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @splat_v2i32(i32 %a) #0 {
+define <2 x i32> @splat_v2i32(i32 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.2s, w0
@@ -273,7 +187,7 @@ define <2 x i32> @splat_v2i32(i32 %a) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @splat_v4i32(i32 %a) #0 {
+define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.4s, w0
@@ -283,7 +197,7 @@ define <4 x i32> @splat_v4i32(i32 %a) #0 {
   ret <4 x i32> %splat
 }
 
-define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 {
+define void @splat_v8i32(i32 %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -312,68 +226,32 @@ define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.s, w0
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <16 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
   store <16 x i32> %splat, <16 x i32>* %b
   ret void
 }
 
-define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov z0.s, w0
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    mov z0.s, w0
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v32i32(i32 %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <32 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
   store <32 x i32> %splat, <32 x i32>* %b
   ret void
 }
 
-define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov z0.s, w0
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov x11, #32
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    mov z0.s, w0
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v64i32(i32 %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <64 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
   store <64 x i32> %splat, <64 x i32>* %b
@@ -381,7 +259,7 @@ define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @splat_v1i64(i64 %a) #0 {
+define <1 x i64> @splat_v1i64(i64 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, x0
@@ -392,7 +270,7 @@ define <1 x i64> @splat_v1i64(i64 %a) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @splat_v2i64(i64 %a) #0 {
+define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup v0.2d, x0
@@ -402,7 +280,7 @@ define <2 x i64> @splat_v2i64(i64 %a) #0 {
   ret <2 x i64> %splat
 }
 
-define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 {
+define void @splat_v4i64(i64 %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -431,68 +309,32 @@ define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.d, x0
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <8 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
   store <8 x i64> %splat, <8 x i64>* %b
   ret void
 }
 
-define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov z0.d, x0
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    mov z0.d, x0
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v16i64(i64 %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov z0.d, x0
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <16 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
   store <16 x i64> %splat, <16 x i64>* %b
   ret void
 }
 
-define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov z0.d, x0
-; VBITS_GE_256-NEXT:    mov x10, #20
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x12, #12
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    mov z0.d, x0
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v32i64(i64 %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    mov z0.d, x0
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %insert = insertelement <32 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
   store <32 x i64> %splat, <32 x i64>* %b
@@ -504,7 +346,7 @@ define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
-define <4 x half> @splat_v4f16(half %a) #0 {
+define <4 x half> @splat_v4f16(half %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
@@ -516,7 +358,7 @@ define <4 x half> @splat_v4f16(half %a) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <8 x half> @splat_v8f16(half %a) #0 {
+define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
@@ -527,7 +369,7 @@ define <8 x half> @splat_v8f16(half %a) #0 {
   ret <8 x half> %splat
 }
 
-define void @splat_v16f16(half %a, <16 x half>* %b) #0 {
+define void @splat_v16f16(half %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
@@ -559,72 +401,34 @@ define void @splat_v32f16(half %a, <32 x half>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.h, h0
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <32 x half> undef, half %a, i64 0
   %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
   store <32 x half> %splat, <32 x half>* %b
   ret void
 }
 
-define void @splat_v64f16(half %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #32
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov z0.h, h0
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov z0.h, h0
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v64f16(half %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <64 x half> undef, half %a, i64 0
   %splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer
   store <64 x half> %splat, <64 x half>* %b
   ret void
 }
 
-define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #112
-; VBITS_GE_256-NEXT:    mov x9, #96
-; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #80
-; VBITS_GE_256-NEXT:    mov z0.h, h0
-; VBITS_GE_256-NEXT:    mov x11, #64
-; VBITS_GE_256-NEXT:    mov x12, #48
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    // kill: def $h0 killed $h0 def $z0
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    mov z0.h, h0
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v128f16(half %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <128 x half> undef, half %a, i64 0
   %splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer
   store <128 x half> %splat, <128 x half>* %b
@@ -632,7 +436,7 @@ define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
+define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
@@ -644,7 +448,7 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
+define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
@@ -655,7 +459,7 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
   ret <4 x float> %splat
 }
 
-define void @splat_v8f32(float %a, <8 x float>* %b) #0 {
+define void @splat_v8f32(float %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
@@ -687,72 +491,34 @@ define void @splat_v16f32(float %a, <16 x float>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.s, s0
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <16 x float> undef, float %a, i64 0
   %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
   store <16 x float> %splat, <16 x float>* %b
   ret void
 }
 
-define void @splat_v32f32(float %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov z0.s, s0
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    mov z0.s, s0
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v32f32(float %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <32 x float> undef, float %a, i64 0
   %splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer
   store <32 x float> %splat, <32 x float>* %b
   ret void
 }
 
-define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov z0.s, s0
-; VBITS_GE_256-NEXT:    mov x11, #32
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    // kill: def $s0 killed $s0 def $z0
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    mov z0.s, s0
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v64f32(float %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <64 x float> undef, float %a, i64 0
   %splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer
   store <64 x float> %splat, <64 x float>* %b
@@ -760,7 +526,7 @@ define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
+define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
@@ -770,7 +536,7 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
+define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
@@ -781,7 +547,7 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
   ret <2 x double> %splat
 }
 
-define void @splat_v4f64(double %a, <4 x double>* %b) #0 {
+define void @splat_v4f64(double %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -813,72 +579,34 @@ define void @splat_v8f64(double %a, <8 x double>* %b) #0 {
 ; VBITS_GE_512-NEXT:    mov z0.d, d0
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
-
-; Ensure sensible type legalisation.
   %insert = insertelement <8 x double> undef, double %a, i64 0
   %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
   store <8 x double> %splat, <8 x double>* %b
   ret void
 }
 
-define void @splat_v16f64(double %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov z0.d, d0
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: splat_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    mov z0.d, d0
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @splat_v16f64(double %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: splat_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <16 x double> undef, double %a, i64 0
   %splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer
   store <16 x double> %splat, <16 x double>* %b
   ret void
 }
 
-define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: splat_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x10, #20
-; VBITS_GE_256-NEXT:    mov z0.d, d0
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x12, #12
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: splat_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    // kill: def $d0 killed $d0 def $z0
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    mov z0.d, d0
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @splat_v32f64(double %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: splat_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <32 x double> undef, double %a, i64 0
   %splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer
   store <32 x double> %splat, <32 x double>* %b
@@ -889,88 +617,52 @@ define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
 ; DUP (integer immediate)
 ;
 
-define void @splat_imm_v64i8(<64 x i8>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v64i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
-; VBITS_GE_256-NEXT:    mov z0.b, #1 // =0x1
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v64i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov z0.b, #1 // =0x1
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v64i8(<64 x i8>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v64i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <64 x i8> undef, i8 1, i64 0
   %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
   store <64 x i8> %splat, <64 x i8>* %a
   ret void
 }
 
-define void @splat_imm_v32i16(<32 x i16>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v32i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov z0.h, #2 // =0x2
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v32i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov z0.h, #2 // =0x2
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v32i16(<32 x i16>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #2 // =0x2
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <32 x i16> undef, i16 2, i64 0
   %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
   store <32 x i16> %splat, <32 x i16>* %a
   ret void
 }
 
-define void @splat_imm_v16i32(<16 x i32>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v16i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov z0.s, #3 // =0x3
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v16i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov z0.s, #3 // =0x3
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v16i32(<16 x i32>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #3 // =0x3
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <16 x i32> undef, i32 3, i64 0
   %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
   store <16 x i32> %splat, <16 x i32>* %a
   ret void
 }
 
-define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v8i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov z0.d, #4 // =0x4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v8i64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov z0.d, #4 // =0x4
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v8i64(<8 x i64>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #4 // =0x4
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <8 x i64> undef, i64 4, i64 0
   %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
   store <8 x i64> %splat, <8 x i64>* %a
@@ -981,69 +673,43 @@ define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
 ; DUP (floating-point immediate)
 ;
 
-define void @splat_imm_v32f16(<32 x half>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    fmov z0.h, #5.00000000
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v32f16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    fmov z0.h, #5.00000000
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v32f16(<32 x half>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.h, #5.00000000
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <32 x half> undef, half 5.0, i64 0
   %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
   store <32 x half> %splat, <32 x half>* %a
   ret void
 }
 
-define void @splat_imm_v16f32(<16 x float>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    fmov z0.s, #6.00000000
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v16f32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    fmov z0.s, #6.00000000
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v16f32(<16 x float>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.s, #6.00000000
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <16 x float> undef, float 6.0, i64 0
   %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
   store <16 x float> %splat, <16 x float>* %a
   ret void
 }
 
-define void @splat_imm_v8f64(<8 x double>* %a) #0 {
-; VBITS_GE_256-LABEL: splat_imm_v8f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    fmov z0.d, #7.00000000
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: splat_imm_v8f64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    fmov z0.d, #7.00000000
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
+define void @splat_imm_v8f64(<8 x double>* %a) vscale_range(4,0) #0 {
+; CHECK-LABEL: splat_imm_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.d, #7.00000000
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %insert = insertelement <8 x double> undef, double 7.0, i64 0
   %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
   store <8 x double> %splat, <8 x double>* %a
   ret void
 }
+
 attributes #0 = { "target-features"="+sve" }

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
index 225edeb5b2d9..0b4c08d79853 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
@@ -1,35 +1,29 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ; Don't use SVE for 64-bit vectors.
 define void @store_v2f32(<2 x float>* %a) #0 {
 ; CHECK-LABEL: store_v2f32:
-; CHECK: str xzr, [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str xzr, [x0]
+; CHECK-NEXT:    ret
   store <2 x float> zeroinitializer, <2 x float>* %a
   ret void
 }
@@ -37,66 +31,148 @@ define void @store_v2f32(<2 x float>* %a) #0 {
 ; Don't use SVE for 128-bit vectors.
 define void @store_v4f32(<4 x float>* %a) #0 {
 ; CHECK-LABEL: store_v4f32:
-; CHECK: stp xzr, xzr, [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp xzr, xzr, [x0]
+; CHECK-NEXT:    ret
   store <4 x float> zeroinitializer, <4 x float>* %a
   ret void
 }
 
 define void @store_v8f32(<8 x float>* %a) #0 {
 ; CHECK-LABEL: store_v8f32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   store <8 x float> zeroinitializer, <8 x float>* %a
   ret void
 }
 
 define void @store_v16f32(<16 x float>* %a) #0 {
-; CHECK-LABEL: store_v16f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: store_v16f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_v16f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+;
+; VBITS_GE_1024-LABEL: store_v16f32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+;
+; VBITS_GE_2048-LABEL: store_v16f32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_2048-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
   store <16 x float> zeroinitializer, <16 x float>* %a
   ret void
 }
 
 define void @store_v32f32(<32 x float>* %a) #0 {
-; CHECK-LABEL: store_v32f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
-; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: store_v32f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #24
+; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    mov x10, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_v32f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    mov x8, #16
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT:    ret
+;
+; VBITS_GE_1024-LABEL: store_v32f32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+;
+; VBITS_GE_2048-LABEL: store_v32f32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
   store <32 x float> zeroinitializer, <32 x float>* %a
   ret void
 }
 
 define void @store_v64f32(<64 x float>* %a) #0 {
-; CHECK-LABEL: store_v64f32:
-; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
-; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
-; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
-; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
-; VBITS_LE_512-DAG:  mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
-; VBITS_LE_512-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
-; VBITS_LE_512-DAG:  mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
-; VBITS_LE_512-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
-; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A4]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
-; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A5]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
-; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A6]], lsl #2]
-; VBITS_LE_256-DAG:  mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
-; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A7]], lsl #2]
-; CHECK: ret
+; VBITS_GE_256-LABEL: store_v64f32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #56
+; VBITS_GE_256-NEXT:    mov x9, #48
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT:    mov x10, #40
+; VBITS_GE_256-NEXT:    mov x11, #32
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #24
+; VBITS_GE_256-NEXT:    mov x12, #16
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_v64f32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    mov x8, #48
+; VBITS_GE_512-NEXT:    mov x9, #32
+; VBITS_GE_512-NEXT:    mov x10, #16
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+;
+; VBITS_GE_1024-LABEL: store_v64f32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    mov x8, #32
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_1024-NEXT:    ret
+;
+; VBITS_GE_2048-LABEL: store_v64f32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
   store <64 x float> zeroinitializer, <64 x float>* %a
   ret void
 }
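
As a quick sanity check on the VBITS_GE_256 immediates above (illustrative arithmetic, not part of the diff):

    ; With 256-bit Z registers, each register holds 256/32 = 8 f32 elements,
    ; so <64 x float> splits into 64/8 = 8 stores at element offsets
    ; 0, 8, 16, ..., 56; the "lsl #2" in [x0, x8, lsl #2] scales those
    ; element indices to byte offsets.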

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
index 99a414f6b66c..2ce5a2d27989 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
@@ -1,21 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  -aarch64-enable-atomic-cfg-tidy=false < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 ; Test we can code generate patterns of the form:
 ;   fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
@@ -28,7 +14,7 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 {
+define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -42,7 +28,7 @@ bb1:
   ret void
 }
 
-define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 {
+define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -82,29 +68,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #32
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: subvector_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i16>, <64 x i16>* %in
   br label %bb1
 
@@ -113,7 +83,7 @@ bb1:
   ret void
 }
 
-define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 {
+define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -153,29 +123,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: subvector_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i32>, <32 x i32>* %in
   br label %bb1
 
@@ -184,41 +138,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov x11, #32
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x13, #16
-; VBITS_GE_256-NEXT:    mov x14, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: subvector_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i32>, <64 x i32>* %in
   br label %bb1
 
@@ -228,23 +154,16 @@ bb1:
 }
 
 
-define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v8i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: subvector_v8i64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_512-NEXT:    ret
+define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) vscale_range(2,0) #0 {
+; CHECK-LABEL: subvector_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %in
   br label %bb1
 
@@ -253,29 +172,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: subvector_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i64>, <16 x i64>* %in
   br label %bb1
 
@@ -284,41 +187,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #20
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x12, #12
-; VBITS_GE_256-NEXT:    mov x13, #8
-; VBITS_GE_256-NEXT:    mov x14, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: subvector_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i64>, <32 x i64>* %in
   br label %bb1
 
@@ -327,7 +202,7 @@ bb1:
   ret void
 }
 
-define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) #0 {
+define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -341,7 +216,7 @@ bb1:
   ret void
 }
 
-define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 {
+define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -381,29 +256,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #48
-; VBITS_GE_256-NEXT:    mov x9, #32
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: subvector_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x half>, <64 x half>* %in
   br label %bb1
 
@@ -412,7 +271,7 @@ bb1:
   ret void
 }
 
-define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 {
+define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -452,29 +311,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: subvector_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x float>, <32 x float>* %in
   br label %bb1
 
@@ -483,41 +326,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov x11, #32
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x13, #16
-; VBITS_GE_256-NEXT:    mov x14, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: subvector_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x float>, <64 x float>* %in
   br label %bb1
 
@@ -550,29 +365,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #12
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: subvector_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_1024-NEXT:    ret
+define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: subvector_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x double>, <16 x double>* %in
   br label %bb1
 
@@ -581,41 +380,13 @@ bb1:
   ret void
 }
 
-define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 {
-; VBITS_GE_256-LABEL: subvector_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #20
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    mov x12, #12
-; VBITS_GE_256-NEXT:    mov x13, #8
-; VBITS_GE_256-NEXT:    mov x14, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: subvector_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_GE_2048-NEXT:    ret
+define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: subvector_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x double>, <32 x double>* %in
   br label %bb1
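
The CHECK bodies in this file were regenerated rather than hand-edited; assuming a standard build tree, the regeneration step is roughly:

    python3 llvm/utils/update_llc_test_checks.py \
        --llc-binary=<build>/bin/llc \
        llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll

The script replays every RUN line and rewrites each function's CHECK lines from the current llc output, which is why the remaining RUN lines must map each -aarch64-sve-vector-bits-min value onto a prefix set (CHECK, VBITS_GE_256, VBITS_GE_512) that still encodes the original expectations.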
 

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
index a32d5ce78f17..9b56968bbcb3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
@@ -1,43 +1,30 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
-define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v2i64i8
-; CHECK: ldr q[[Q0:[0-9]+]], [x0]
-; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
-; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
-; CHECK-NEXT: ret
+define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) vscale_range(2,0) #0 {
+; CHECK-LABEL: store_trunc_v2i64i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    st1b { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <2 x i64>, <2 x i64>* %ap
   %val = trunc <2 x i64> %a to <2 x i8>
   store <2 x i8> %val, <2 x i8>* %dest
   ret void
 }
 
-define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v4i64i8
-; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
-; CHECK-NEXT: ret
+define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) vscale_range(2,0) #0 {
+; CHECK-LABEL: store_trunc_v4i64i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1b { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <4 x i64>, <4 x i64>* %ap
   %val = trunc <4 x i64> %a to <4 x i8>
   store <4 x i8> %val, <4 x i8>* %dest
@@ -45,48 +32,52 @@ define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
 }
 
 define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG2]], [[WORDS_LO]].s, [[WORDS_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1b { [[WORDS]].s }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    st1b { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %val = trunc <8 x i64> %a to <8 x i8>
   store <8 x i8> %val, <8 x i8>* %dest
   ret void
 }
 
-define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
+define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) vscale_range(8,0) #0 {
 ; CHECK-LABEL: store_trunc_v16i64i8:
-; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1b { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i64>, <16 x i64>* %ap
   %val = trunc <16 x i64> %a to <16 x i8>
   store <16 x i8> %val, <16 x i8>* %dest
   ret void
 }
 
-define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
+define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) vscale_range(16,0) #0 {
 ; CHECK-LABEL: store_trunc_v32i64i8:
-; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1b { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i64>, <32 x i64>* %ap
   %val = trunc <32 x i64> %a to <32 x i8>
   store <32 x i8> %val, <32 x i8>* %dest
@@ -94,25 +85,27 @@ define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
 }
 
 define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i16:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
 ; Currently does not use the truncating store
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z[[HALFS_LO:[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[HALFS_HI:[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-NEXT: mov v[[HALFS_LO]].d[1], v[[HALFS_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[HALFS_LO]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    str q1, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %val = trunc <8 x i64> %a to <8 x i16>
   store <8 x i16> %val, <8 x i16>* %dest
@@ -120,24 +113,26 @@ define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
 }
 
 define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i32:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG1]], [[WORDS_LO]].s, [[WORDS_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[WORDS]].s }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %val = trunc <8 x i64> %a to <8 x i32>
   store <8 x i32> %val, <8 x i32>* %dest
@@ -145,25 +140,27 @@ define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
 }
 
 define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i32i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
 ; Currently does not use the truncating store
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-DAG: uzp1 z[[BYTES_LO:[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
-; VBITS_EQ_256-DAG: uzp1 z[[BYTES_HI:[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-NEXT: mov v[[BYTES_LO]].d[1], v[[BYTES_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[BYTES_LO]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT:    str q1, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1b { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %val = trunc <16 x i32> %a to <16 x i8>
   store <16 x i8> %val, <16 x i8>* %dest
@@ -171,24 +168,26 @@ define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
 }
 
 define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i32i16:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-DAG: splice [[HALFS:z[0-9]+]].h, [[PG2]], [[HALFS_LO]].h, [[HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[HALFS]].h }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %val = trunc <16 x i32> %a to <16 x i16>
   store <16 x i16> %val, <16 x i16>* %dest
@@ -196,24 +195,26 @@ define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
 }
 
 define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v32i16i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[HALFS_LO:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HALFS_HI:z[0-9]+]].h }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].b, vl16
-; VBITS_EQ_256-DAG: uzp1 [[BYTES_LO:z[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
-; VBITS_EQ_256-DAG: uzp1 [[BYTES_HI:z[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-DAG: splice [[BYTES:z[0-9]+]].b, [[PG2]], [[BYTES_LO]].b, [[BYTES_HI]].b
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: st1b { [[BYTES]].b }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %ap
   %val = trunc <32 x i16> %a to <32 x i8>
   store <32 x i8> %val, <32 x i8>* %dest
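
The per-function vscale_range annotations are what make the RUN-line reduction possible: vscale_range(N,0) promises the backend that vscale is at least N with no upper bound, i.e. SVE registers are at least N x 128 bits wide, so the wide fixed-length cases produce the same code under every remaining RUN line and share a single CHECK body. A minimal sketch of the idea, with the function name and attribute group chosen purely for illustration:

    ; Requires at least 2 x 128 = 256-bit SVE registers, so the whole
    ; <8 x i32> store fits in one predicated st1w.
    define void @example_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
      store <8 x i32> zeroinitializer, <8 x i32>* %a
      ret void
    }
    attributes #0 = { "target-features"="+sve" }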

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
index 275716e06c23..8850bd4e84cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
@@ -1,35 +1,22 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
 ;
 ; truncate i16 -> i8
 ;
 
-define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
+define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v16i16_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %in
   %b = trunc <16 x i16> %a to <16 x i8>
   ret <16 x i8> %b
@@ -37,11 +24,30 @@ define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
 define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
-; CHECK-LABEL: trunc_v32i16_v32i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    add z0.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <32 x i16>, <32 x i16>* %in
   %b = trunc <32 x i16> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -50,12 +56,16 @@ define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
+define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v64i16_v64i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i16>, <64 x i16>* %in
   %b = trunc <64 x i16> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -64,12 +74,16 @@ define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
+define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v128i16_v128i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <128 x i16>, <128 x i16>* %in
   %b = trunc <128 x i16> %a to <128 x i8>
   %c = add <128 x i8> %b, %b
@@ -81,38 +95,60 @@ define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
 ; truncate i32 -> i8
 ;
 
-define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) #0 {
+define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v8i32_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %in
   %b = trunc <8 x i32> %a to <8 x i8>
   ret <8 x i8> %b
 }
 
 define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %in) #0 {
-; CHECK-LABEL: trunc_v16i32_v16i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
+; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %a = load <16 x i32>, <16 x i32>* %in
   %b = trunc <16 x i32> %a to <16 x i8>
   ret <16 x i8> %b
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
+define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v32i32_v32i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i32>, <32 x i32>* %in
   %b = trunc <32 x i32> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -121,13 +157,17 @@ define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
+define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v64i32_v64i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i32>, <64 x i32>* %in
   %b = trunc <64 x i32> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -139,12 +179,14 @@ define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
 ; truncate i32 -> i16
 ;
 
-define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
+define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v8i32_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %in
   %b = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %b
@@ -152,11 +194,30 @@ define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
 define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
-; CHECK-LABEL: trunc_v16i32_v16i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    add z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <16 x i32>, <16 x i32>* %in
   %b = trunc <16 x i32> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -165,12 +226,16 @@ define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
+define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v32i32_v32i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i32>, <32 x i32>* %in
   %b = trunc <32 x i32> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -179,12 +244,16 @@ define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
+define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v64i32_v64i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <64 x i32>, <64 x i32>* %in
   %b = trunc <64 x i32> %a to <64 x i16>
   %c = add <64 x i16> %b, %b
@@ -197,53 +266,78 @@ define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
 ;
 
 ; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
-define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) #0 {
+define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v4i64_v4i8:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <4 x i64>, <4 x i64>* %in
   %b = trunc <4 x i64> %a to <4 x i8>
   ret <4 x i8> %b
 }
 
 define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %in) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %in
   %b = trunc <8 x i64> %a to <8 x i8>
   ret <8 x i8> %b
 }
 
-define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) #0 {
+define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v16i64_v16i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <16 x i64>, <16 x i64>* %in
   %b = trunc <16 x i64> %a to <16 x i8>
   ret <16 x i8> %b
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
+define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v32i64_v32i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i64>, <32 x i64>* %in
   %b = trunc <32 x i64> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -255,38 +349,60 @@ define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
 ; truncate i64 -> i16
 ;
 
-define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) #0 {
+define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v4i64_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <4 x i64>, <4 x i64>* %in
   %b = trunc <4 x i64> %a to <4 x i16>
   ret <4 x i16> %b
 }
 
 define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %in) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %in
   %b = trunc <8 x i64> %a to <8 x i16>
   ret <8 x i16> %b
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
+define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v16i64_v16i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i64>, <16 x i64>* %in
   %b = trunc <16 x i64> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -295,13 +411,17 @@ define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
+define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v32i64_v32i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i64>, <32 x i64>* %in
   %b = trunc <32 x i64> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -313,12 +433,14 @@ define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
 ; truncate i64 -> i32
 ;
 
-define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
+define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v4i64_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %a = load <4 x i64>, <4 x i64>* %in
   %b = trunc <4 x i64> %a to <4 x i32>
   ret <4 x i32> %b
@@ -326,11 +448,30 @@ define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
 define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    add z0.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %in
   %b = trunc <8 x i64> %a to <8 x i32>
   %c = add <8 x i32> %b, %b
@@ -339,12 +480,16 @@ define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
+define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v16i64_v16i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <16 x i64>, <16 x i64>* %in
   %b = trunc <16 x i64> %a to <16 x i32>
   %c = add <16 x i32> %b, %b
@@ -353,12 +498,16 @@ define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
 }
 
 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) #0 {
+define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v32i64_v32i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
   %a = load <32 x i64>, <32 x i64>* %in
   %b = trunc <32 x i64> %a to <32 x i32>
   %c = add <32 x i32> %b, %b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
index 61156422b46b..fff1e502becb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -1,26 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Don't use SVE for 64-bit vectors
-define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #7
@@ -30,7 +16,7 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors
-define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
+define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
@@ -40,7 +26,7 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
   ret <16 x i8> %ret
 }
 
-define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl32
@@ -62,7 +48,6 @@ define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov w8, #32
@@ -104,47 +89,18 @@ define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #64
-; VBITS_GE_256-NEXT:    mov w10, #32
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov w9, #96
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1, x8]
-; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x10]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1, x9]
-; VBITS_GE_256-NEXT:    ld1b { z4.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.b, z0.b[31]
-; VBITS_GE_256-NEXT:    fmov w11, s5
-; VBITS_GE_256-NEXT:    mov z5.b, z2.b[31]
-; VBITS_GE_256-NEXT:    mov z1.b, z1.b[31]
-; VBITS_GE_256-NEXT:    fmov w12, s5
-; VBITS_GE_256-NEXT:    mov z5.b, z4.b[31]
-; VBITS_GE_256-NEXT:    fmov w13, s1
-; VBITS_GE_256-NEXT:    fmov w14, s5
-; VBITS_GE_256-NEXT:    insr z3.b, w11
-; VBITS_GE_256-NEXT:    insr z0.b, w12
-; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT:    insr z4.b, w13
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT:    insr z2.b, w14
-; VBITS_GE_256-NEXT:    st1b { z4.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v128i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT:    mov w8, #127
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.b, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb w8, p1, z0.b
-; VBITS_GE_1024-NEXT:    insr z1.b, w8
-; VBITS_GE_1024-NEXT:    st1b { z1.b }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.b
+; CHECK-NEXT:    insr z1.b, w8
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
   %ret = shufflevector <128 x i8> %op1, <128 x i8> %op2, <128 x i32> <i32 127,  i32 128,  i32 129,  i32 130,  i32 131,  i32 132,  i32 133,  i32 134,
@@ -167,71 +123,18 @@ define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v256i8:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov w11, #128
-; VBITS_GE_256-NEXT:    mov w13, #64
-; VBITS_GE_256-NEXT:    mov w12, #96
-; VBITS_GE_256-NEXT:    mov w14, #160
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1, x8]
-; VBITS_GE_256-NEXT:    mov w10, #192
-; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1, x11]
-; VBITS_GE_256-NEXT:    ld1b { z5.b }, p0/z, [x1, x13]
-; VBITS_GE_256-NEXT:    mov w9, #224
-; VBITS_GE_256-NEXT:    ld1b { z7.b }, p0/z, [x1, x12]
-; VBITS_GE_256-NEXT:    ld1b { z4.b }, p0/z, [x1, x10]
-; VBITS_GE_256-NEXT:    mov z6.b, z0.b[31]
-; VBITS_GE_256-NEXT:    fmov w15, s6
-; VBITS_GE_256-NEXT:    ld1b { z6.b }, p0/z, [x1, x14]
-; VBITS_GE_256-NEXT:    mov z16.b, z3.b[31]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x9]
-; VBITS_GE_256-NEXT:    ld1b { z17.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fmov w16, s16
-; VBITS_GE_256-NEXT:    mov z16.b, z5.b[31]
-; VBITS_GE_256-NEXT:    insr z5.b, w15
-; VBITS_GE_256-NEXT:    fmov w15, s16
-; VBITS_GE_256-NEXT:    mov z16.b, z7.b[31]
-; VBITS_GE_256-NEXT:    mov z1.b, z1.b[31]
-; VBITS_GE_256-NEXT:    fmov w17, s16
-; VBITS_GE_256-NEXT:    mov z16.b, z6.b[31]
-; VBITS_GE_256-NEXT:    fmov w18, s16
-; VBITS_GE_256-NEXT:    mov z16.b, z4.b[31]
-; VBITS_GE_256-NEXT:    insr z7.b, w15
-; VBITS_GE_256-NEXT:    fmov w15, s16
-; VBITS_GE_256-NEXT:    mov z16.b, z17.b[31]
-; VBITS_GE_256-NEXT:    fmov w1, s1
-; VBITS_GE_256-NEXT:    fmov w2, s16
-; VBITS_GE_256-NEXT:    insr z3.b, w17
-; VBITS_GE_256-NEXT:    insr z6.b, w16
-; VBITS_GE_256-NEXT:    insr z4.b, w18
-; VBITS_GE_256-NEXT:    insr z2.b, w15
-; VBITS_GE_256-NEXT:    insr z17.b, w1
-; VBITS_GE_256-NEXT:    insr z0.b, w2
-; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0, x9]
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT:    st1b { z4.b }, p0, [x0, x10]
-; VBITS_GE_256-NEXT:    st1b { z3.b }, p0, [x0, x11]
-; VBITS_GE_256-NEXT:    st1b { z7.b }, p0, [x0, x12]
-; VBITS_GE_256-NEXT:    st1b { z5.b }, p0, [x0, x13]
-; VBITS_GE_256-NEXT:    st1b { z6.b }, p0, [x0, x14]
-; VBITS_GE_256-NEXT:    st1b { z17.b }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v256i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT:    mov w8, #255
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.b, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb w8, p1, z0.b
-; VBITS_GE_2048-NEXT:    insr z1.b, w8
-; VBITS_GE_2048-NEXT:    st1b { z1.b }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v256i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    mov w8, #255
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.b
+; CHECK-NEXT:    insr z1.b, w8
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
   %ret = shufflevector <256 x i8> %op1, <256 x i8> %op2, <256 x i32> <i32 255,  i32 256,  i32 257,  i32 258,  i32 259,  i32 260,  i32 261,  i32 262,
@@ -271,7 +174,7 @@ define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors
-define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
@@ -281,7 +184,7 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors
-define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
@@ -290,7 +193,7 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
   ret <8 x i16> %ret
 }
 
-define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -310,7 +213,6 @@ define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #16
@@ -348,47 +250,18 @@ define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.h, z0.h[15]
-; VBITS_GE_256-NEXT:    fmov w11, s5
-; VBITS_GE_256-NEXT:    mov z5.h, z2.h[15]
-; VBITS_GE_256-NEXT:    mov z1.h, z1.h[15]
-; VBITS_GE_256-NEXT:    fmov w12, s5
-; VBITS_GE_256-NEXT:    mov z5.h, z4.h[15]
-; VBITS_GE_256-NEXT:    fmov w13, s1
-; VBITS_GE_256-NEXT:    fmov w14, s5
-; VBITS_GE_256-NEXT:    insr z3.h, w11
-; VBITS_GE_256-NEXT:    insr z0.h, w12
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    insr z4.h, w13
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    insr z2.h, w14
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov w8, #63
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.h, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb w8, p1, z0.h
-; VBITS_GE_1024-NEXT:    insr z1.h, w8
-; VBITS_GE_1024-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.h
+; CHECK-NEXT:    insr z1.h, w8
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
   %ret = shufflevector <64 x i16> %op1, <64 x i16> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -403,71 +276,18 @@ define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x10, #64
-; VBITS_GE_256-NEXT:    mov x13, #32
-; VBITS_GE_256-NEXT:    mov x14, #48
-; VBITS_GE_256-NEXT:    mov x11, #80
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov x12, #96
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    mov x9, #112
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    mov z6.h, z0.h[15]
-; VBITS_GE_256-NEXT:    fmov w15, s6
-; VBITS_GE_256-NEXT:    ld1h { z6.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    mov z16.h, z2.h[15]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z17.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fmov w16, s16
-; VBITS_GE_256-NEXT:    mov z16.h, z5.h[15]
-; VBITS_GE_256-NEXT:    insr z5.h, w15
-; VBITS_GE_256-NEXT:    fmov w15, s16
-; VBITS_GE_256-NEXT:    mov z16.h, z7.h[15]
-; VBITS_GE_256-NEXT:    mov z1.h, z1.h[15]
-; VBITS_GE_256-NEXT:    fmov w17, s16
-; VBITS_GE_256-NEXT:    mov z16.h, z6.h[15]
-; VBITS_GE_256-NEXT:    fmov w18, s16
-; VBITS_GE_256-NEXT:    mov z16.h, z4.h[15]
-; VBITS_GE_256-NEXT:    insr z7.h, w15
-; VBITS_GE_256-NEXT:    fmov w15, s16
-; VBITS_GE_256-NEXT:    mov z16.h, z17.h[15]
-; VBITS_GE_256-NEXT:    fmov w1, s1
-; VBITS_GE_256-NEXT:    fmov w2, s16
-; VBITS_GE_256-NEXT:    insr z2.h, w17
-; VBITS_GE_256-NEXT:    insr z6.h, w16
-; VBITS_GE_256-NEXT:    insr z4.h, w18
-; VBITS_GE_256-NEXT:    insr z3.h, w15
-; VBITS_GE_256-NEXT:    insr z17.h, w1
-; VBITS_GE_256-NEXT:    insr z0.h, w2
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z6.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z17.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    mov w8, #127
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.h, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb w8, p1, z0.h
-; VBITS_GE_2048-NEXT:    insr z1.h, w8
-; VBITS_GE_2048-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v128i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.h
+; CHECK-NEXT:    insr z1.h, w8
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
   %ret = shufflevector <128 x i16> %op1, <128 x i16> %op2, <128 x i32> <i32 127,  i32 128,  i32 129,  i32 130,  i32 131,  i32 132,  i32 133,  i32 134,
@@ -491,7 +311,7 @@ define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors
-define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
@@ -501,7 +321,7 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors
-define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
@@ -510,7 +330,7 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
   ret <4 x i32> %ret
 }
 
-define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -529,7 +349,6 @@ define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #8
@@ -565,47 +384,18 @@ define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.s, z0.s[7]
-; VBITS_GE_256-NEXT:    fmov w11, s5
-; VBITS_GE_256-NEXT:    mov z5.s, z2.s[7]
-; VBITS_GE_256-NEXT:    mov z1.s, z1.s[7]
-; VBITS_GE_256-NEXT:    fmov w12, s5
-; VBITS_GE_256-NEXT:    mov z5.s, z4.s[7]
-; VBITS_GE_256-NEXT:    fmov w13, s1
-; VBITS_GE_256-NEXT:    fmov w14, s5
-; VBITS_GE_256-NEXT:    insr z3.s, w11
-; VBITS_GE_256-NEXT:    insr z0.s, w12
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    insr z4.s, w13
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    insr z2.s, w14
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    mov w8, #31
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.s, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb w8, p1, z0.s
-; VBITS_GE_1024-NEXT:    insr z1.s, w8
-; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.s
+; CHECK-NEXT:    insr z1.s, w8
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
   %ret = shufflevector <32 x i32> %op1, <32 x i32> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -616,71 +406,18 @@ define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    mov x13, #16
-; VBITS_GE_256-NEXT:    mov x14, #24
-; VBITS_GE_256-NEXT:    mov x11, #40
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x12, #48
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #56
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    mov z6.s, z0.s[7]
-; VBITS_GE_256-NEXT:    fmov w15, s6
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    mov z16.s, z2.s[7]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fmov w16, s16
-; VBITS_GE_256-NEXT:    mov z16.s, z5.s[7]
-; VBITS_GE_256-NEXT:    insr z5.s, w15
-; VBITS_GE_256-NEXT:    fmov w15, s16
-; VBITS_GE_256-NEXT:    mov z16.s, z7.s[7]
-; VBITS_GE_256-NEXT:    mov z1.s, z1.s[7]
-; VBITS_GE_256-NEXT:    fmov w17, s16
-; VBITS_GE_256-NEXT:    mov z16.s, z6.s[7]
-; VBITS_GE_256-NEXT:    fmov w18, s16
-; VBITS_GE_256-NEXT:    mov z16.s, z4.s[7]
-; VBITS_GE_256-NEXT:    insr z7.s, w15
-; VBITS_GE_256-NEXT:    fmov w15, s16
-; VBITS_GE_256-NEXT:    mov z16.s, z17.s[7]
-; VBITS_GE_256-NEXT:    fmov w1, s1
-; VBITS_GE_256-NEXT:    fmov w2, s16
-; VBITS_GE_256-NEXT:    insr z2.s, w17
-; VBITS_GE_256-NEXT:    insr z6.s, w16
-; VBITS_GE_256-NEXT:    insr z4.s, w18
-; VBITS_GE_256-NEXT:    insr z3.s, w15
-; VBITS_GE_256-NEXT:    insr z17.s, w1
-; VBITS_GE_256-NEXT:    insr z0.s, w2
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z17.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    mov w8, #63
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.s, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb w8, p1, z0.s
-; VBITS_GE_2048-NEXT:    insr z1.s, w8
-; VBITS_GE_2048-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.s
+; CHECK-NEXT:    insr z1.s, w8
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
   %ret = shufflevector <64 x i32> %op1, <64 x i32> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -696,7 +433,7 @@ define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors
-define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
@@ -705,7 +442,7 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
   ret <2 x i64> %ret
 }
 
-define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -724,7 +461,6 @@ define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #4
@@ -759,47 +495,18 @@ define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.d, z0.d[3]
-; VBITS_GE_256-NEXT:    fmov x11, d5
-; VBITS_GE_256-NEXT:    mov z5.d, z2.d[3]
-; VBITS_GE_256-NEXT:    mov z1.d, z1.d[3]
-; VBITS_GE_256-NEXT:    fmov x12, d5
-; VBITS_GE_256-NEXT:    mov z5.d, z4.d[3]
-; VBITS_GE_256-NEXT:    fmov x13, d1
-; VBITS_GE_256-NEXT:    fmov x14, d5
-; VBITS_GE_256-NEXT:    insr z3.d, x11
-; VBITS_GE_256-NEXT:    insr z0.d, x12
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    insr z4.d, x13
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    insr z2.d, x14
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    mov w8, #15
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.d, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb x8, p1, z0.d
-; VBITS_GE_1024-NEXT:    insr z1.d, x8
-; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.d, xzr, x8
+; CHECK-NEXT:    lastb x8, p1, z0.d
+; CHECK-NEXT:    insr z1.d, x8
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
   %ret = shufflevector <16 x i64> %op1, <16 x i64> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -808,71 +515,18 @@ define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x13, #8
-; VBITS_GE_256-NEXT:    mov x14, #12
-; VBITS_GE_256-NEXT:    mov x11, #20
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    mov x9, #28
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    mov z6.d, z0.d[3]
-; VBITS_GE_256-NEXT:    fmov x15, d6
-; VBITS_GE_256-NEXT:    ld1d { z6.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    mov z16.d, z2.d[3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fmov x16, d16
-; VBITS_GE_256-NEXT:    mov z16.d, z5.d[3]
-; VBITS_GE_256-NEXT:    insr z5.d, x15
-; VBITS_GE_256-NEXT:    fmov x15, d16
-; VBITS_GE_256-NEXT:    mov z16.d, z7.d[3]
-; VBITS_GE_256-NEXT:    mov z1.d, z1.d[3]
-; VBITS_GE_256-NEXT:    fmov x17, d16
-; VBITS_GE_256-NEXT:    mov z16.d, z6.d[3]
-; VBITS_GE_256-NEXT:    fmov x18, d16
-; VBITS_GE_256-NEXT:    mov z16.d, z4.d[3]
-; VBITS_GE_256-NEXT:    insr z7.d, x15
-; VBITS_GE_256-NEXT:    fmov x15, d16
-; VBITS_GE_256-NEXT:    mov z16.d, z17.d[3]
-; VBITS_GE_256-NEXT:    fmov x1, d1
-; VBITS_GE_256-NEXT:    fmov x2, d16
-; VBITS_GE_256-NEXT:    insr z2.d, x17
-; VBITS_GE_256-NEXT:    insr z6.d, x16
-; VBITS_GE_256-NEXT:    insr z4.d, x18
-; VBITS_GE_256-NEXT:    insr z3.d, x15
-; VBITS_GE_256-NEXT:    insr z17.d, x1
-; VBITS_GE_256-NEXT:    insr z0.d, x2
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z6.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z17.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    mov w8, #31
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.d, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb x8, p1, z0.d
-; VBITS_GE_2048-NEXT:    insr z1.d, x8
-; VBITS_GE_2048-NEXT:    st1d { z1.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.d, xzr, x8
+; CHECK-NEXT:    lastb x8, p1, z0.d
+; CHECK-NEXT:    insr z1.d, x8
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
   %ret = shufflevector <32 x i64> %op1, <32 x i64> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -884,7 +538,7 @@ define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors
-define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
@@ -894,7 +548,7 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) #0
 }
 
 ; Don't use SVE for 128-bit vectors
-define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
@@ -903,7 +557,7 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) #0
   ret <8 x half> %ret
 }
 
-define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
+define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
@@ -922,7 +576,6 @@ define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #16
@@ -957,43 +610,18 @@ define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #32
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.h, z0.h[15]
-; VBITS_GE_256-NEXT:    insr z1.h, h5
-; VBITS_GE_256-NEXT:    mov z5.h, z3.h[15]
-; VBITS_GE_256-NEXT:    mov z2.h, z2.h[15]
-; VBITS_GE_256-NEXT:    insr z0.h, h5
-; VBITS_GE_256-NEXT:    mov z5.h, z4.h[15]
-; VBITS_GE_256-NEXT:    insr z4.h, h2
-; VBITS_GE_256-NEXT:    insr z3.h, h5
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64f16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov w8, #63
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.h, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb h0, p1, z0.h
-; VBITS_GE_1024-NEXT:    insr z1.h, h0
-; VBITS_GE_1024-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    lastb h0, p1, z0.h
+; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x half>, <64 x half>* %a
   %op2 = load <64 x half>, <64 x half>* %b
   %ret = shufflevector <64 x half> %op1, <64 x half> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -1008,63 +636,18 @@ define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v128f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x10, #64
-; VBITS_GE_256-NEXT:    mov x9, #80
-; VBITS_GE_256-NEXT:    mov x11, #16
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #48
-; VBITS_GE_256-NEXT:    mov x8, #112
-; VBITS_GE_256-NEXT:    mov x14, #96
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x10, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x1, x11, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1, x12, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z7.h }, p0/z, [x1, x14, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z17.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z18.h, z3.h[15]
-; VBITS_GE_256-NEXT:    mov z6.h, z1.h[15]
-; VBITS_GE_256-NEXT:    insr z1.h, h18
-; VBITS_GE_256-NEXT:    mov z18.h, z5.h[15]
-; VBITS_GE_256-NEXT:    mov z19.h, z4.h[15]
-; VBITS_GE_256-NEXT:    insr z4.h, h18
-; VBITS_GE_256-NEXT:    mov z18.h, z16.h[15]
-; VBITS_GE_256-NEXT:    insr z3.h, h18
-; VBITS_GE_256-NEXT:    mov z18.h, z7.h[15]
-; VBITS_GE_256-NEXT:    insr z7.h, h6
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT:    mov z6.h, z17.h[15]
-; VBITS_GE_256-NEXT:    insr z16.h, h19
-; VBITS_GE_256-NEXT:    insr z2.h, h18
-; VBITS_GE_256-NEXT:    insr z17.h, h0
-; VBITS_GE_256-NEXT:    insr z5.h, h6
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z7.h }, p0, [x0, x14, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z16.h }, p0, [x0, x13, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z4.h }, p0, [x0, x12, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z5.h }, p0, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z17.h }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128f16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    mov w8, #127
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.h, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb h0, p1, z0.h
-; VBITS_GE_2048-NEXT:    insr z1.h, h0
-; VBITS_GE_2048-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v128f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    lastb h0, p1, z0.h
+; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <128 x half>, <128 x half>* %a
   %op2 = load <128 x half>, <128 x half>* %b
   %ret = shufflevector <128 x half> %op1, <128 x half> %op2, <128 x i32> <i32 127,  i32 128,  i32 129,  i32 130,  i32 131,  i32 132,  i32 133,  i32 134,
@@ -1088,7 +671,7 @@ define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors
-define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
@@ -1098,7 +681,7 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 }
 
 ; Don't use SVE for 128-bit vectors
-define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
@@ -1107,7 +690,7 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %ret
 }
 
-define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
+define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -1125,7 +708,6 @@ define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #8
@@ -1158,43 +740,18 @@ define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov x9, #24
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.s, z0.s[7]
-; VBITS_GE_256-NEXT:    insr z1.s, s5
-; VBITS_GE_256-NEXT:    mov z5.s, z3.s[7]
-; VBITS_GE_256-NEXT:    mov z2.s, z2.s[7]
-; VBITS_GE_256-NEXT:    insr z0.s, s5
-; VBITS_GE_256-NEXT:    mov z5.s, z4.s[7]
-; VBITS_GE_256-NEXT:    insr z4.s, s2
-; VBITS_GE_256-NEXT:    insr z3.s, s5
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    mov w8, #31
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.s, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb s0, p1, z0.s
-; VBITS_GE_1024-NEXT:    insr z1.s, s0
-; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    lastb s0, p1, z0.s
+; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %op2 = load <32 x float>, <32 x float>* %b
   %ret = shufflevector <32 x float> %op1, <32 x float> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -1205,63 +762,18 @@ define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x10, #32
-; VBITS_GE_256-NEXT:    mov x9, #40
-; VBITS_GE_256-NEXT:    mov x11, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    mov x13, #24
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x14, #48
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z18.s, z3.s[7]
-; VBITS_GE_256-NEXT:    mov z6.s, z1.s[7]
-; VBITS_GE_256-NEXT:    insr z1.s, s18
-; VBITS_GE_256-NEXT:    mov z18.s, z5.s[7]
-; VBITS_GE_256-NEXT:    mov z19.s, z4.s[7]
-; VBITS_GE_256-NEXT:    insr z4.s, s18
-; VBITS_GE_256-NEXT:    mov z18.s, z16.s[7]
-; VBITS_GE_256-NEXT:    insr z3.s, s18
-; VBITS_GE_256-NEXT:    mov z18.s, z7.s[7]
-; VBITS_GE_256-NEXT:    insr z7.s, s6
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT:    mov z6.s, z17.s[7]
-; VBITS_GE_256-NEXT:    insr z16.s, s19
-; VBITS_GE_256-NEXT:    insr z2.s, s18
-; VBITS_GE_256-NEXT:    insr z17.s, s0
-; VBITS_GE_256-NEXT:    insr z5.s, s6
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z16.s }, p0, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z17.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    mov w8, #63
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.s, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb s0, p1, z0.s
-; VBITS_GE_2048-NEXT:    insr z1.s, s0
-; VBITS_GE_2048-NEXT:    st1w { z1.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    lastb s0, p1, z0.s
+; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %op2 = load <64 x float>, <64 x float>* %b
   %ret = shufflevector <64 x float> %op1, <64 x float> %op2, <64 x i32> <i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70,
@@ -1277,7 +789,7 @@ define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
 }
 
 ; Don't use SVE for 128-bit vectors
-define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
@@ -1286,7 +798,7 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
   ret <2 x double> %ret
 }
 
-define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1304,7 +816,6 @@ define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
 }
 
 define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    mov x8, #4
@@ -1336,43 +847,18 @@ define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
-; VBITS_GE_256-NEXT:    mov x10, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z5.d, z0.d[3]
-; VBITS_GE_256-NEXT:    insr z1.d, d5
-; VBITS_GE_256-NEXT:    mov z5.d, z3.d[3]
-; VBITS_GE_256-NEXT:    mov z2.d, z2.d[3]
-; VBITS_GE_256-NEXT:    insr z0.d, d5
-; VBITS_GE_256-NEXT:    mov z5.d, z4.d[3]
-; VBITS_GE_256-NEXT:    insr z4.d, d2
-; VBITS_GE_256-NEXT:    insr z3.d, d5
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16f64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    mov w8, #15
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    whilels p1.d, xzr, x8
-; VBITS_GE_1024-NEXT:    lastb d0, p1, z0.d
-; VBITS_GE_1024-NEXT:    insr z1.d, d0
-; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p1, z0.d
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %op2 = load <16 x double>, <16 x double>* %b
   %ret = shufflevector <16 x double> %op1, <16 x double> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -1381,63 +867,18 @@ define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
-; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x10, #16
-; VBITS_GE_256-NEXT:    mov x9, #20
-; VBITS_GE_256-NEXT:    mov x11, #4
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x12, #8
-; VBITS_GE_256-NEXT:    mov x13, #12
-; VBITS_GE_256-NEXT:    mov x8, #28
-; VBITS_GE_256-NEXT:    mov x14, #24
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x10, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x1, x11, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x12, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z7.d }, p0/z, [x1, x14, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z17.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z18.d, z3.d[3]
-; VBITS_GE_256-NEXT:    mov z6.d, z1.d[3]
-; VBITS_GE_256-NEXT:    insr z1.d, d18
-; VBITS_GE_256-NEXT:    mov z18.d, z5.d[3]
-; VBITS_GE_256-NEXT:    mov z19.d, z4.d[3]
-; VBITS_GE_256-NEXT:    insr z4.d, d18
-; VBITS_GE_256-NEXT:    mov z18.d, z16.d[3]
-; VBITS_GE_256-NEXT:    insr z3.d, d18
-; VBITS_GE_256-NEXT:    mov z18.d, z7.d[3]
-; VBITS_GE_256-NEXT:    insr z7.d, d6
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT:    mov z6.d, z17.d[3]
-; VBITS_GE_256-NEXT:    insr z16.d, d19
-; VBITS_GE_256-NEXT:    insr z2.d, d18
-; VBITS_GE_256-NEXT:    insr z17.d, d0
-; VBITS_GE_256-NEXT:    insr z5.d, d6
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z7.d }, p0, [x0, x14, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0, x10, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z16.d }, p0, [x0, x13, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z4.d }, p0, [x0, x12, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z5.d }, p0, [x0, x11, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z17.d }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32f64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    mov w8, #31
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    whilels p1.d, xzr, x8
-; VBITS_GE_2048-NEXT:    lastb d0, p1, z0.d
-; VBITS_GE_2048-NEXT:    insr z1.d, d0
-; VBITS_GE_2048-NEXT:    st1d { z1.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    whilels p1.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p1, z0.d
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %op2 = load <32 x double>, <32 x double>* %b
   %ret = shufflevector <32 x double> %op1, <32 x double> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -1448,7 +889,7 @@ define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_byone_reverse:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
@@ -1465,7 +906,7 @@ define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) #0 {
   ret void
 }
 
-define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
+define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: shuffle_ext_invalid:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -1487,11 +928,7 @@ define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %op2 = load <4 x double>, <4 x double>* %b
@@ -1500,4 +937,4 @@ define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
   ret void
 }
 
-attributes #0 = { "target-features"="+sve" uwtable }
+attributes #0 = { "target-features"="+sve" }

