[llvm] [LLVM][CodeGen][SVE] Remove failure cases when widening vector load/store ops. (PR #160515)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 06:01:12 PDT 2025
https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/160515
When unable to widen a vector load/store, we can replace the operation with a masked variant. Support for extending loads largely came for free, hence its inclusion, but truncating stores require more work.
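For example, a load or store of <vscale x 3 x i8> is widened to <vscale x 4 x i8> and guarded by a predicate enabling only the first 3 * vscale lanes. A rough IR-level sketch of the idea (illustrative only; the patch builds the equivalent MLOAD/MSTORE SelectionDAG nodes via GET_ACTIVE_LANE_MASK during type legalisation rather than emitting these intrinsics):

  %vl = call i64 @llvm.vscale.i64()
  %len = mul i64 %vl, 3                ; number of lanes in the original <vscale x 3 x i8>
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %len)
  %wide = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> poison)
  call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> %wide, ptr %b, i32 1, <vscale x 4 x i1> %mask)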
NOTE: The generated code can be improved, but I figured it best to solve the functional issues first.
NOTE: The VE change is required because I generalised the legalisation to support targets whose masks are not i1 based, although truly exercising that would require a target with scalable vectors and no dedicated predicate registers, which we don't have. It looks like a clear omission, hence the "fix", but if that's not the case I can remove it and just recalculate `WideMaskVT` for the non-VP-load/store code.
Fixes https://github.com/llvm/llvm-project/issues/159995
From 55dee1b105c404e0902043197b0e7271e4457419 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 22 Sep 2025 18:34:29 +0100
Subject: [PATCH 1/3] Add legalisation tests for scalable vector load/store
ops.
---
.../AArch64/sve-load-store-legalisation.ll | 1204 +++++++++++++++++
1 file changed, 1204 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
new file mode 100644
index 0000000000000..4da999ce3e8f9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
@@ -0,0 +1,1204 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i8>, ptr %a
+; store <vscale x 1 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ store <vscale x 2 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i8>, ptr %a
+; store <vscale x 3 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ store <vscale x 4 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i8>, ptr %a
+; store <vscale x 5 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ store <vscale x 6 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i8>, ptr %a
+; store <vscale x 7 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ store <vscale x 8 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
+; %c = load <vscale x 9 x i8>, ptr %a
+; store <vscale x 9 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ store <vscale x 10 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
+; %c = load <vscale x 11 x i8>, ptr %a
+; store <vscale x 11 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ store <vscale x 12 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
+; %c = load <vscale x 13 x i8>, ptr %a
+; store <vscale x 13 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: ld1b { z1.h }, p2/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p2, [x1]
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ store <vscale x 14 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
+; %c = load <vscale x 15 x i8>, ptr %a
+; store <vscale x 15 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ store <vscale x 16 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
+; %c = load <vscale x 17 x i8>, ptr %a
+; store <vscale x 17 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv18i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, x8]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 18 x i8>, ptr %a
+ store <vscale x 18 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
+; %c = load <vscale x 19 x i8>, ptr %a
+; store <vscale x 19 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv20i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 20 x i8>, ptr %a
+ store <vscale x 20 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
+; %c = load <vscale x 21 x i8>, ptr %a
+; store <vscale x 21 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv22i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x8, all, mul #5
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p1, [x1, x8]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 22 x i8>, ptr %a
+ store <vscale x 22 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
+; %c = load <vscale x 23 x i8>, ptr %a
+; store <vscale x 23 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv24i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1b { z0.h }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 24 x i8>, ptr %a
+ store <vscale x 24 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
+; %c = load <vscale x 25 x i8>, ptr %a
+; store <vscale x 25 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv26i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cnth x8, all, mul #3
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 26 x i8>, ptr %a
+ store <vscale x 26 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
+; %c = load <vscale x 27 x i8>, ptr %a
+; store <vscale x 27 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv28i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 28 x i8>, ptr %a
+ store <vscale x 28 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
+; %c = load <vscale x 29 x i8>, ptr %a
+; store <vscale x 29 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv30i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cntw x8, all, mul #7
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z2.h }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p2, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #6, mul vl]
+; CHECK-NEXT: str z3, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 30 x i8>, ptr %a
+ store <vscale x 30 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
+; %c = load <vscale x 31 x i8>, ptr %a
+; store <vscale x 31 x i8> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 32 x i8>, ptr %a
+ store <vscale x 32 x i8> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i16>, ptr %a
+; store <vscale x 1 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ store <vscale x 2 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i16>, ptr %a
+; store <vscale x 3 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ store <vscale x 4 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i16>, ptr %a
+; store <vscale x 5 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ store <vscale x 6 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i16>, ptr %a
+; store <vscale x 7 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ store <vscale x 8 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
+; %c = load <vscale x 9 x i16>, ptr %a
+; store <vscale x 9 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i16>, ptr %a
+ store <vscale x 10 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
+; %c = load <vscale x 11 x i16>, ptr %a
+; store <vscale x 11 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1h { z0.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i16>, ptr %a
+ store <vscale x 12 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
+; %c = load <vscale x 13 x i16>, ptr %a
+; store <vscale x 13 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i16>, ptr %a
+ store <vscale x 14 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
+; %c = load <vscale x 15 x i16>, ptr %a
+; store <vscale x 15 x i16> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i16>, ptr %a
+ store <vscale x 16 x i16> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i32>, ptr %a
+; store <vscale x 1 x i32> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ store <vscale x 2 x i32> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i32>, ptr %a
+; store <vscale x 3 x i32> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ store <vscale x 4 x i32> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i32>, ptr %a
+; store <vscale x 5 x i32> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1w { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i32>, ptr %a
+ store <vscale x 6 x i32> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i32>, ptr %a
+; store <vscale x 7 x i32> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i32>, ptr %a
+ store <vscale x 8 x i32> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i64>, ptr %a
+; store <vscale x 1 x i64> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i64>, ptr %a
+ store <vscale x 2 x i64> %c, ptr %b
+ ret void
+}
+
+;define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i64>, ptr %a
+; store <vscale x 3 x i64> %c, ptr %b
+; ret void
+;}
+
+define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i64>, ptr %a
+ store <vscale x 4 x i64> %c, ptr %b
+ ret void
+}
+
+;define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i8>, ptr %a
+; %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+; ret <vscale x 1 x i16> %c.sext
+;}
+
+define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.sext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.sext
+}
+
+;define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i8>, ptr %a
+; %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+; ret <vscale x 3 x i16> %c.sext
+;}
+
+define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.sext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.sext
+}
+
+;define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i8>, ptr %a
+; %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+; ret <vscale x 5 x i16> %c.sext
+;}
+
+;define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) {
+; %c = load <vscale x 6 x i8>, ptr %a
+; %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+; ret <vscale x 6 x i16> %c.sext
+;}
+
+;define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i8>, ptr %a
+; %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+; ret <vscale x 7 x i16> %c.sext
+;}
+
+define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.sext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.sext
+}
+
+;define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) {
+; %c = load <vscale x 9 x i8>, ptr %a
+; %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+; ret <vscale x 9 x i16> %c.sext
+;}
+
+;define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) {
+; %c = load <vscale x 10 x i8>, ptr %a
+; %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+; ret <vscale x 10 x i16> %c.sext
+;}
+
+;define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) {
+; %c = load <vscale x 11 x i8>, ptr %a
+; %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+; ret <vscale x 11 x i16> %c.sext
+;}
+
+;define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) {
+; %c = load <vscale x 12 x i8>, ptr %a
+; %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+; ret <vscale x 12 x i16> %c.sext
+;}
+
+;define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) {
+; %c = load <vscale x 13 x i8>, ptr %a
+; %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+; ret <vscale x 13 x i16> %c.sext
+;}
+
+;define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) {
+; %c = load <vscale x 14 x i8>, ptr %a
+; %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+; ret <vscale x 14 x i16> %c.sext
+;}
+
+;define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) {
+; %c = load <vscale x 15 x i8>, ptr %a
+; %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+; ret <vscale x 15 x i16> %c.sext
+;}
+
+define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.sext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16>
+ ret <vscale x 16 x i16> %c.sext
+}
+
+;define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i16>, ptr %a
+; %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+; ret <vscale x 1 x i32> %c.sext
+;}
+
+define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.sext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %c.sext
+}
+
+;define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i16>, ptr %a
+; %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+; ret <vscale x 3 x i32> %c.sext
+;}
+
+define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.sext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %c.sext
+}
+
+;define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i16>, ptr %a
+; %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+; ret <vscale x 5 x i32> %c.sext
+;}
+
+;define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) {
+; %c = load <vscale x 6 x i16>, ptr %a
+; %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+; ret <vscale x 6 x i32> %c.sext
+;}
+
+;define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i16>, ptr %a
+; %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+; ret <vscale x 7 x i32> %c.sext
+;}
+
+define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.sext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %c.sext
+}
+
+;define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i32>, ptr %a
+; %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+; ret <vscale x 1 x i64> %c.sext
+;}
+
+define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.sext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %c.sext
+}
+
+;define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i32>, ptr %a
+; %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+; ret <vscale x 3 x i64> %c.sext
+;}
+
+define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.sext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %c.sext
+}
+
+;define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i8>, ptr %a
+; %c.zext = zext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+; ret <vscale x 1 x i16> %c.zext
+;}
+
+define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.zext = zext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.zext
+}
+
+;define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i8>, ptr %a
+; %c.zext = zext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+; ret <vscale x 3 x i16> %c.zext
+;}
+
+define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.zext = zext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.zext
+}
+
+;define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i8>, ptr %a
+; %c.zext = zext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+; ret <vscale x 5 x i16> %c.zext
+;}
+
+;define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) {
+; %c = load <vscale x 6 x i8>, ptr %a
+; %c.zext = zext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+; ret <vscale x 6 x i16> %c.zext
+;}
+
+;define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i8>, ptr %a
+; %c.zext = zext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+; ret <vscale x 7 x i16> %c.zext
+;}
+
+define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.zext = zext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.zext
+}
+
+;define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) {
+; %c = load <vscale x 9 x i8>, ptr %a
+; %c.zext = zext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+; ret <vscale x 9 x i16> %c.zext
+;}
+
+;define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) {
+; %c = load <vscale x 10 x i8>, ptr %a
+; %c.zext = zext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+; ret <vscale x 10 x i16> %c.zext
+;}
+
+;define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) {
+; %c = load <vscale x 11 x i8>, ptr %a
+; %c.zext = zext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+; ret <vscale x 11 x i16> %c.zext
+;}
+
+;define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) {
+; %c = load <vscale x 12 x i8>, ptr %a
+; %c.zext = zext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+; ret <vscale x 12 x i16> %c.zext
+;}
+
+;define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) {
+; %c = load <vscale x 13 x i8>, ptr %a
+; %c.zext = zext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+; ret <vscale x 13 x i16> %c.zext
+;}
+
+;define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) {
+; %c = load <vscale x 14 x i8>, ptr %a
+; %c.zext = zext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+; ret <vscale x 14 x i16> %c.zext
+;}
+
+;define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) {
+; %c = load <vscale x 15 x i8>, ptr %a
+; %c.zext = zext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+; ret <vscale x 15 x i16> %c.zext
+;}
+
+define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.zext = zext <vscale x 16 x i8> %c to <vscale x 16 x i16>
+ ret <vscale x 16 x i16> %c.zext
+}
+
+;define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i16>, ptr %a
+; %c.zext = zext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+; ret <vscale x 1 x i32> %c.zext
+;}
+
+define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.zext = zext <vscale x 2 x i16> %c to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %c.zext
+}
+
+;define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i16>, ptr %a
+; %c.zext = zext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+; ret <vscale x 3 x i32> %c.zext
+;}
+
+define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.zext = zext <vscale x 4 x i16> %c to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %c.zext
+}
+
+;define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) {
+; %c = load <vscale x 5 x i16>, ptr %a
+; %c.zext = zext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+; ret <vscale x 5 x i32> %c.zext
+;}
+
+;define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) {
+; %c = load <vscale x 6 x i16>, ptr %a
+; %c.zext = zext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+; ret <vscale x 6 x i32> %c.zext
+;}
+
+;define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) {
+; %c = load <vscale x 7 x i16>, ptr %a
+; %c.zext = zext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+; ret <vscale x 7 x i32> %c.zext
+;}
+
+define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.zext = zext <vscale x 8 x i16> %c to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %c.zext
+}
+
+;define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) {
+; %c = load <vscale x 1 x i32>, ptr %a
+; %c.zext = zext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+; ret <vscale x 1 x i64> %c.zext
+;}
+
+define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.zext = zext <vscale x 2 x i32> %c to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %c.zext
+}
+
+;define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) {
+; %c = load <vscale x 3 x i32>, ptr %a
+; %c.zext = zext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+; ret <vscale x 3 x i64> %c.zext
+;}
+
+define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.zext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %c.zext
+}
From 9ff812ca203016d480c2db03af449c3d5709723e Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 22 Sep 2025 18:34:29 +0100
Subject: [PATCH 2/3] [LLVM][CodeGen][SVE] Remove failure cases when widening
vector load/store ops.
When unable to widen a vector load/store, we can replace the operation with a
masked variant. Support for extending loads largely came for free, hence its
inclusion, but truncating stores require more work.
NOTE: The generated code can be improved, but I figured it best to solve the
functional issues first.
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 50 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 1 +
.../AArch64/sve-load-store-legalisation.ll | 1660 +++++++++++++----
3 files changed, 1350 insertions(+), 361 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ff7cd665446cc..87d5453cd98cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6256,17 +6256,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
// FIXME: Not all targets may support EVL in VP_LOAD. These will have been
// removed from the IR by the ExpandVectorPredication pass but we're
// reintroducing them here.
- EVT LdVT = LD->getMemoryVT();
- EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT);
- EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WideVT.getVectorElementCount());
+ EVT VT = LD->getValueType(0);
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ EVT WideMaskVT = getSetCCResultType(WideVT);
+
if (ExtType == ISD::NON_EXTLOAD &&
TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) &&
TLI.isTypeLegal(WideMaskVT)) {
SDLoc DL(N);
SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(),
- LdVT.getVectorElementCount());
+ VT.getVectorElementCount());
SDValue NewLoad =
DAG.getLoadVP(LD->getAddressingMode(), ISD::NON_EXTLOAD, WideVT, DL,
LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -6303,6 +6303,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
return Result;
}
+ if (VT.isVector()) {
+ // If all else fails replace the load with a wide masked load.
+ SDLoc DL(N);
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+
+ SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
+ DAG.getConstant(0, DL, IdxVT), Len);
+
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
+ DAG.getPOISON(WideVT), LD->getMemoryVT(), LD->getMemOperand(),
+ LD->getAddressingMode(), LD->getExtensionType());
+
+ ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
+ return NewLoad;
+ }
+
report_fatal_error("Unable to widen vector load");
}
@@ -7516,8 +7534,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
SDValue StVal = ST->getValue();
EVT StVT = StVal.getValueType();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
- EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WideVT.getVectorElementCount());
+ EVT WideMaskVT = getSetCCResultType(WideVT);
if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
TLI.isTypeLegal(WideMaskVT)) {
@@ -7540,6 +7557,22 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
}
+ if (StVT.isVector()) {
+ // If all else fails replace the store with a wide masked store.
+ SDLoc DL(N);
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+
+ SDValue WideStVal = GetWidenedVector(StVal);
+ SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
+ DAG.getConstant(0, DL, IdxVT), Len);
+
+ return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
+ ST->getOffset(), Mask, ST->getMemoryVT(),
+ ST->getMemOperand(), ST->getAddressingMode(),
+ ST->isTruncatingStore());
+ }
+
report_fatal_error("Unable to widen vector store");
}
@@ -8298,8 +8331,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
AAMDNodes AAInfo = LD->getAAInfo();
if (LdVT.isScalableVector())
- report_fatal_error("Generating widen scalable extending vector loads is "
- "not yet supported");
+ return SDValue();
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 09b31616e0882..c0d06b917b19f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1537,6 +1537,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::MULHS, VT, Custom);
setOperationAction(ISD::MULHU, VT, Custom);
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
index 4da999ce3e8f9..32ad2fce74d3f 100644
--- a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
@@ -3,11 +3,19 @@
target triple = "aarch64-unknown-linux-gnu"
-;define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i8>, ptr %a
-; store <vscale x 1 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ store <vscale x 1 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv2i8:
@@ -21,11 +29,21 @@ define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i8>, ptr %a
-; store <vscale x 3 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ store <vscale x 3 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv4i8:
@@ -39,11 +57,21 @@ define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i8>, ptr %a
-; store <vscale x 5 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ store <vscale x 5 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv6i8:
@@ -64,11 +92,21 @@ define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i8>, ptr %a
-; store <vscale x 7 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ store <vscale x 7 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv8i8:
@@ -82,11 +120,21 @@ define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
-; %c = load <vscale x 9 x i8>, ptr %a
-; store <vscale x 9 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ store <vscale x 9 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv10i8:
@@ -110,11 +158,21 @@ define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
-; %c = load <vscale x 11 x i8>, ptr %a
-; store <vscale x 11 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ store <vscale x 11 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv12i8:
@@ -136,11 +194,21 @@ define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
-; %c = load <vscale x 13 x i8>, ptr %a
-; store <vscale x 13 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ store <vscale x 13 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv14i8:
@@ -168,11 +236,21 @@ define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
-; %c = load <vscale x 15 x i8>, ptr %a
-; store <vscale x 15 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ store <vscale x 15 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv16i8:
@@ -185,11 +263,24 @@ define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
-; %c = load <vscale x 17 x i8>, ptr %a
-; store <vscale x 17 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv17i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #17 // =0x11
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 17 x i8>, ptr %a
+ store <vscale x 17 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv18i8:
@@ -261,11 +352,24 @@ define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
-; %c = load <vscale x 19 x i8>, ptr %a
-; store <vscale x 19 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv19i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #19 // =0x13
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 19 x i8>, ptr %a
+ store <vscale x 19 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv20i8:
@@ -297,11 +401,24 @@ define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
-; %c = load <vscale x 21 x i8>, ptr %a
-; store <vscale x 21 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv21i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #21 // =0x15
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 21 x i8>, ptr %a
+ store <vscale x 21 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv22i8:
@@ -340,11 +457,24 @@ define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
-; %c = load <vscale x 23 x i8>, ptr %a
-; store <vscale x 23 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv23i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #23 // =0x17
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 23 x i8>, ptr %a
+ store <vscale x 23 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv24i8:
@@ -362,11 +492,24 @@ define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
-; %c = load <vscale x 25 x i8>, ptr %a
-; store <vscale x 25 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv25i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #25 // =0x19
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 25 x i8>, ptr %a
+ store <vscale x 25 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv26i8:
@@ -393,11 +536,24 @@ define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
-; %c = load <vscale x 27 x i8>, ptr %a
-; store <vscale x 27 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv27i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #27 // =0x1b
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 27 x i8>, ptr %a
+ store <vscale x 27 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv28i8:
@@ -421,11 +577,24 @@ define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
-; %c = load <vscale x 29 x i8>, ptr %a
-; store <vscale x 29 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv29i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #29 // =0x1d
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 29 x i8>, ptr %a
+ store <vscale x 29 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv30i8:
@@ -456,11 +625,24 @@ define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
-; %c = load <vscale x 31 x i8>, ptr %a
-; store <vscale x 31 x i8> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv31i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #31 // =0x1f
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 31 x i8>, ptr %a
+ store <vscale x 31 x i8> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv32i8:
@@ -475,11 +657,19 @@ define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i16>, ptr %a
-; store <vscale x 1 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ store <vscale x 1 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv2i16:
@@ -493,11 +683,21 @@ define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i16>, ptr %a
-; store <vscale x 3 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ store <vscale x 3 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv4i16:
@@ -511,11 +711,21 @@ define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i16>, ptr %a
-; store <vscale x 5 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ store <vscale x 5 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv6i16:
@@ -537,11 +747,21 @@ define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i16>, ptr %a
-; store <vscale x 7 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ store <vscale x 7 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv8i16:
@@ -554,11 +774,25 @@ define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
-; %c = load <vscale x 9 x i16>, ptr %a
-; store <vscale x 9 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i16>, ptr %a
+ store <vscale x 9 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv10i16:
@@ -590,11 +824,25 @@ define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
-; %c = load <vscale x 11 x i16>, ptr %a
-; store <vscale x 11 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i16>, ptr %a
+ store <vscale x 11 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv12i16:
@@ -612,11 +860,25 @@ define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
-; %c = load <vscale x 13 x i16>, ptr %a
-; store <vscale x 13 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i16>, ptr %a
+ store <vscale x 13 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv14i16:
@@ -640,11 +902,25 @@ define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
-; %c = load <vscale x 15 x i16>, ptr %a
-; store <vscale x 15 x i16> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i16>, ptr %a
+ store <vscale x 15 x i16> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv16i16:
@@ -659,11 +935,19 @@ define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i32>, ptr %a
-; store <vscale x 1 x i32> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ store <vscale x 1 x i32> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv2i32:
@@ -677,11 +961,21 @@ define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i32>, ptr %a
-; store <vscale x 3 x i32> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ store <vscale x 3 x i32> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv4i32:
@@ -694,11 +988,25 @@ define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i32>, ptr %a
-; store <vscale x 5 x i32> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i32>, ptr %a
+ store <vscale x 5 x i32> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv6i32:
@@ -716,11 +1024,25 @@ define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i32>, ptr %a
-; store <vscale x 7 x i32> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i32>, ptr %a
+ store <vscale x 7 x i32> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv8i32:
@@ -735,11 +1057,19 @@ define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i64>, ptr %a
-; store <vscale x 1 x i64> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i64>, ptr %a
+ store <vscale x 1 x i64> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv2i64:
@@ -752,11 +1082,25 @@ define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
ret void
}
-;define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i64>, ptr %a
-; store <vscale x 3 x i64> %c, ptr %b
-; ret void
-;}
+define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i64>, ptr %a
+ store <vscale x 3 x i64> %c, ptr %b
+ ret void
+}
define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
; CHECK-LABEL: sve_load_store_nxv4i64:
@@ -771,11 +1115,18 @@ define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
ret void
}
-;define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i8>, ptr %a
-; %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
-; ret <vscale x 1 x i16> %c.sext
-;}
+define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.sext
+}
define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv2i8:
@@ -788,11 +1139,20 @@ define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
ret <vscale x 2 x i16> %c.sext
}
-;define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i8>, ptr %a
-; %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
-; ret <vscale x 3 x i16> %c.sext
-;}
+define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.sext
+}
define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv4i8:
@@ -805,23 +1165,47 @@ define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
ret <vscale x 4 x i16> %c.sext
}
-;define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i8>, ptr %a
-; %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
-; ret <vscale x 5 x i16> %c.sext
-;}
+define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.sext
+}
-;define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) {
-; %c = load <vscale x 6 x i8>, ptr %a
-; %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
-; ret <vscale x 6 x i16> %c.sext
-;}
+define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.sext
+}
-;define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i8>, ptr %a
-; %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
-; ret <vscale x 7 x i16> %c.sext
-;}
+define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.sext
+}
define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv8i8:
@@ -834,47 +1218,203 @@ define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) {
ret <vscale x 8 x i16> %c.sext
}
-;define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) {
-; %c = load <vscale x 9 x i8>, ptr %a
-; %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
-; ret <vscale x 9 x i16> %c.sext
-;}
-
-;define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) {
-; %c = load <vscale x 10 x i8>, ptr %a
-; %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
-; ret <vscale x 10 x i16> %c.sext
-;}
-
-;define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) {
-; %c = load <vscale x 11 x i8>, ptr %a
-; %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
-; ret <vscale x 11 x i16> %c.sext
-;}
-
-;define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) {
-; %c = load <vscale x 12 x i8>, ptr %a
-; %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
-; ret <vscale x 12 x i16> %c.sext
-;}
-
-;define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) {
-; %c = load <vscale x 13 x i8>, ptr %a
-; %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
-; ret <vscale x 13 x i16> %c.sext
-;}
-
-;define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) {
-; %c = load <vscale x 14 x i8>, ptr %a
-; %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
-; ret <vscale x 14 x i16> %c.sext
-;}
-
-;define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) {
-; %c = load <vscale x 15 x i8>, ptr %a
-; %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
-; ret <vscale x 15 x i16> %c.sext
-;}
+define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.sext
+}
+
+define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.sext
+}
+
+define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.sext
+}
+
+define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.sext
+}
+
+define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+ ret <vscale x 13 x i16> %c.sext
+}
+
+define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+ ret <vscale x 14 x i16> %c.sext
+}
+
+define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+ ret <vscale x 15 x i16> %c.sext
+}
define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv16i8:
@@ -888,11 +1428,18 @@ define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) {
ret <vscale x 16 x i16> %c.sext
}
-;define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i16>, ptr %a
-; %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
-; ret <vscale x 1 x i32> %c.sext
-;}
+define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+ ret <vscale x 1 x i32> %c.sext
+}
define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv2i16:
@@ -905,11 +1452,20 @@ define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) {
ret <vscale x 2 x i32> %c.sext
}
-;define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i16>, ptr %a
-; %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
-; ret <vscale x 3 x i32> %c.sext
-;}
+define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+ ret <vscale x 3 x i32> %c.sext
+}
define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv4i16:
@@ -922,23 +1478,88 @@ define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) {
ret <vscale x 4 x i32> %c.sext
}
-;define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i16>, ptr %a
-; %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
-; ret <vscale x 5 x i32> %c.sext
-;}
+define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+ ret <vscale x 5 x i32> %c.sext
+}
-;define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) {
-; %c = load <vscale x 6 x i16>, ptr %a
-; %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
-; ret <vscale x 6 x i32> %c.sext
-;}
+define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+ ret <vscale x 6 x i32> %c.sext
+}
-;define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i16>, ptr %a
-; %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
-; ret <vscale x 7 x i32> %c.sext
-;}
+define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+ ret <vscale x 7 x i32> %c.sext
+}
define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv8i16:
@@ -952,11 +1573,18 @@ define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) {
ret <vscale x 8 x i32> %c.sext
}
-;define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i32>, ptr %a
-; %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
-; ret <vscale x 1 x i64> %c.sext
-;}
+define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+ ret <vscale x 1 x i64> %c.sext
+}
define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv2i32:
@@ -969,11 +1597,33 @@ define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) {
ret <vscale x 2 x i64> %c.sext
}
-;define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i32>, ptr %a
-; %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
-; ret <vscale x 3 x i64> %c.sext
-;}
+define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+ ret <vscale x 3 x i64> %c.sext
+}
define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_sextload_nxv4i32:
@@ -987,11 +1637,18 @@ define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) {
ret <vscale x 4 x i64> %c.sext
}
-;define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i8>, ptr %a
-; %c.zext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
-; ret <vscale x 1 x i16> %c.zext
-;}
+define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.zext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.zext
+}
define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv2i8:
@@ -1004,11 +1661,20 @@ define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) {
ret <vscale x 2 x i16> %c.zext
}
-;define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i8>, ptr %a
-; %c.zext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
-; ret <vscale x 3 x i16> %c.zext
-;}
+define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.zext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.zext
+}
define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv4i8:
@@ -1021,23 +1687,47 @@ define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) {
ret <vscale x 4 x i16> %c.zext
}
-;define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i8>, ptr %a
-; %c.zext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
-; ret <vscale x 5 x i16> %c.zext
-;}
+define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.zext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.zext
+}
-;define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) {
-; %c = load <vscale x 6 x i8>, ptr %a
-; %c.zext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
-; ret <vscale x 6 x i16> %c.zext
-;}
+define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.zext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.zext
+}
-;define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i8>, ptr %a
-; %c.zext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
-; ret <vscale x 7 x i16> %c.zext
-;}
+define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.zext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.zext
+}
define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv8i8:
@@ -1050,47 +1740,203 @@ define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) {
ret <vscale x 8 x i16> %c.zext
}
-;define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) {
-; %c = load <vscale x 9 x i8>, ptr %a
-; %c.zext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
-; ret <vscale x 9 x i16> %c.zext
-;}
-
-;define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) {
-; %c = load <vscale x 10 x i8>, ptr %a
-; %c.zext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
-; ret <vscale x 10 x i16> %c.zext
-;}
-
-;define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) {
-; %c = load <vscale x 11 x i8>, ptr %a
-; %c.zext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
-; ret <vscale x 11 x i16> %c.zext
-;}
-
-;define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) {
-; %c = load <vscale x 12 x i8>, ptr %a
-; %c.zext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
-; ret <vscale x 12 x i16> %c.zext
-;}
-
-;define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) {
-; %c = load <vscale x 13 x i8>, ptr %a
-; %c.zext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
-; ret <vscale x 13 x i16> %c.zext
-;}
-
-;define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) {
-; %c = load <vscale x 14 x i8>, ptr %a
-; %c.zext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
-; ret <vscale x 14 x i16> %c.zext
-;}
-
-;define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) {
-; %c = load <vscale x 15 x i8>, ptr %a
-; %c.zext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
-; ret <vscale x 15 x i16> %c.zext
-;}
+define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.zext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.zext
+}
+
+define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.zext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.zext
+}
+
+define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.zext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.zext
+}
+
+define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.zext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.zext
+}
+
+define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.zext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+ ret <vscale x 13 x i16> %c.zext
+}
+
+define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.zext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+ ret <vscale x 14 x i16> %c.zext
+}
+
+define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.zext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+ ret <vscale x 15 x i16> %c.zext
+}
define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv16i8:
@@ -1104,11 +1950,18 @@ define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) {
ret <vscale x 16 x i16> %c.zext
}
-;define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i16>, ptr %a
-; %c.zext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
-; ret <vscale x 1 x i32> %c.zext
-;}
+define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.zext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+ ret <vscale x 1 x i32> %c.zext
+}
define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv2i16:
@@ -1121,11 +1974,20 @@ define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) {
ret <vscale x 2 x i32> %c.zext
}
-;define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i16>, ptr %a
-; %c.zext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
-; ret <vscale x 3 x i32> %c.zext
-;}
+define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.zext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+ ret <vscale x 3 x i32> %c.zext
+}
define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv4i16:
@@ -1138,23 +2000,88 @@ define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) {
ret <vscale x 4 x i32> %c.zext
}
-;define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) {
-; %c = load <vscale x 5 x i16>, ptr %a
-; %c.zext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
-; ret <vscale x 5 x i32> %c.zext
-;}
+define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.zext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+ ret <vscale x 5 x i32> %c.zext
+}
-;define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) {
-; %c = load <vscale x 6 x i16>, ptr %a
-; %c.zext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
-; ret <vscale x 6 x i32> %c.zext
-;}
+define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.zext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+ ret <vscale x 6 x i32> %c.zext
+}
-;define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) {
-; %c = load <vscale x 7 x i16>, ptr %a
-; %c.zext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
-; ret <vscale x 7 x i32> %c.zext
-;}
+define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.zext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+ ret <vscale x 7 x i32> %c.zext
+}
define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv8i16:
@@ -1168,11 +2095,18 @@ define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) {
ret <vscale x 8 x i32> %c.zext
}
-;define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) {
-; %c = load <vscale x 1 x i32>, ptr %a
-; %c.zext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
-; ret <vscale x 1 x i64> %c.zext
-;}
+define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.zext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+ ret <vscale x 1 x i64> %c.zext
+}
define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv2i32:
@@ -1185,11 +2119,33 @@ define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) {
ret <vscale x 2 x i64> %c.zext
}
-;define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) {
-; %c = load <vscale x 3 x i32>, ptr %a
-; %c.zext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
-; ret <vscale x 3 x i64> %c.zext
-;}
+define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.zext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+ ret <vscale x 3 x i64> %c.zext
+}
define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) {
; CHECK-LABEL: sve_zextload_nxv4i32:
>From 35c34447f2bba1ee9b6867003b5080db7e3ba5da Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 24 Sep 2025 12:20:17 +0000
Subject: [PATCH 3/3] Fix missing vector support in
VETargetLowering::getSetCCResultType.
---
llvm/lib/Target/VE/VEISelLowering.cpp | 2 +
llvm/test/CodeGen/VE/Vector/vec_divrem.ll | 56 ++++++++++-------------
2 files changed, 27 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 2cfdc751a55e0..a068138791cb4 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -957,6 +957,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
EVT VT) const {
+ if (VT.isVector())
+ return VT.changeVectorElementType(MVT::i1);
return MVT::i32;
}
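
As a side note for reviewers, the hook now follows the usual convention of an i1-per-lane mask type for vector compares instead of the scalar i32 default. Below is a minimal standalone sketch of the intended mapping, not part of the patch; it is plain C++ against LLVM's ValueTypes API (assumes an LLVM dev install to compile/link against), and the v4i32 example type is arbitrary:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    // Example operand type for a vector compare; any vector VT behaves the same.
    EVT VecVT = EVT::getVectorVT(Ctx, MVT::i32, 4); // v4i32

    // Mirrors the patched VETargetLowering::getSetCCResultType: vector inputs
    // get an i1-per-lane mask result type, scalars keep the existing i32.
    EVT CCVT = VecVT.isVector() ? VecVT.changeVectorElementType(MVT::i1)
                                : EVT(MVT::i32);

    errs() << CCVT.getEVTString() << "\n"; // prints "v4i1"
  }
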
diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index 3bc0aba8d4264..93e2889793ba5 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -7,19 +7,22 @@
define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: udiv_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s0, %s0, %s4
-; CHECK-NEXT: srl %s0, %s0, 32
+; CHECK-NEXT: and %s4, %s0, (56)0
; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: muls.l %s1, %s1, %s4
-; CHECK-NEXT: srl %s1, %s1, 32
; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: muls.l %s2, %s2, %s4
-; CHECK-NEXT: srl %s2, %s2, 32
; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: muls.l %s3, %s3, %s4
-; CHECK-NEXT: srl %s3, %s3, 32
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: cmpu.w %s5, %s3, (56)0
+; CHECK-NEXT: or %s3, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s2, (56)0
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s1, (56)0
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
@@ -28,27 +31,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: urem_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s5, %s3, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s3, %s3, %s5
-; CHECK-NEXT: muls.l %s5, %s2, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s2, %s2, %s5
-; CHECK-NEXT: muls.l %s5, %s1, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s1, %s1, %s5
-; CHECK-NEXT: muls.l %s4, %s0, %s4
-; CHECK-NEXT: srl %s4, %s4, 32
-; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s0, %s0, %s4
+; CHECK-NEXT: and %s4, %s0, (56)0
+; CHECK-NEXT: and %s5, %s1, (56)0
+; CHECK-NEXT: and %s6, %s2, (56)0
+; CHECK-NEXT: and %s7, %s3, (56)0
+; CHECK-NEXT: cmpu.w %s7, %s7, (56)0
+; CHECK-NEXT: cmov.w.eq %s3, (0)1, %s7
+; CHECK-NEXT: cmpu.w %s6, %s6, (56)0
+; CHECK-NEXT: cmov.w.eq %s2, (0)1, %s6
+; CHECK-NEXT: cmpu.w %s5, %s5, (56)0
+; CHECK-NEXT: cmov.w.eq %s1, (0)1, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (0)1, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = urem <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
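
For context, the new per-lane compare/cmov sequences in these checks correspond to the standard scalar folds for unsigned division and remainder by the all-ones value, i.e. udiv x, ~0 -> (x == ~0 ? 1 : 0) and urem x, ~0 -> (x == ~0 ? 0 : x), replacing the previous magic-multiply expansions. A small standalone sanity check of that identity, not part of the patch, in plain C++:

  #include <cassert>
  #include <cstdint>

  // Scalar model of the folds the new VE sequences implement for i8 lanes.
  static uint8_t udiv_by_255(uint8_t x) { return x == 0xff ? 1 : 0; }
  static uint8_t urem_by_255(uint8_t x) { return x == 0xff ? 0 : x; }

  int main() {
    for (unsigned x = 0; x <= 0xff; ++x) {
      assert(udiv_by_255(x) == x / 255u);
      assert(urem_by_255(x) == x % 255u);
    }
  }
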