[llvm] [AArch64] Improve lowering for scalable masked deinterleaving loads (PR #154338)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 19 06:57:02 PDT 2025
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/154338
For IR like this:
%mask = ... @llvm.vector.interleave2(<vscale x 16 x i1> %a, <vscale x 16 x i1> %a)
%vec = ... @llvm.masked.load(..., <vscale x 32 x i1> %mask, ...)
%dvec = ... @llvm.vector.deinterleave2(<vscale x 32 x i8> %vec)
where we're deinterleaving a wide masked load of a supported type
whose mask is itself an interleave of a single predicate, we can lower
this directly to an ld2b instruction. Similarly, we can also support
other variants of ld2 and ld4.
This PR adds a DAG combine to spot such patterns and lower to ld2X
or ld4X variants accordingly, whilst being careful to ensure the
masked load is only used by the deinterleave intrinsic.
From 9e32691da19f5bb183f44d2030763c647dc94b73 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Aug 2025 13:47:53 +0000
Subject: [PATCH 1/2] Add tests
---
.../AArch64/masked_deinterleaved_loads.ll | 669 ++++++++++++++++++
1 file changed, 669 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll
diff --git a/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll b/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll
new file mode 100644
index 0000000000000..4b2032aa91e60
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll
@@ -0,0 +1,669 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) { ; 2-way i8 case: masked-load nxv32i8, split it into two nxv16i8 parts, return their sum
+; CHECK-LABEL: foo_ld2_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
+; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: uzp2 z2.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z2.b
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) ; both interleave2 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) ; align 1, poison passthru
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) ; expected output above: two predicated ld1b loads plus uzp1/uzp2, no ld2b
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %add = add <vscale x 16 x i8> %part1, %part2 ; uses both deinterleaved parts
+ ret <vscale x 16 x i8> %add
+}
+
+define <vscale x 8 x i16> @foo_ld2_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) { ; 2-way i16 case: masked-load nxv16i16, split it into two nxv8i16 parts, return their sum
+; CHECK-LABEL: foo_ld2_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 p1.h, p0.h, p0.h
+; CHECK-NEXT: zip2 p0.h, p0.h, p0.h
+; CHECK-NEXT: ld1h { z1.h }, p1/z, [x0]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: uzp2 z2.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) ; both interleave2 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i16> poison) ; align 2, poison passthru
+ %strided.vec = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %wide.masked.vec) ; expected output above: two predicated ld1h loads plus uzp1/uzp2, no ld2h
+ %part1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %strided.vec, 1
+ %add = add <vscale x 8 x i16> %part1, %part2 ; uses both deinterleaved parts
+ ret <vscale x 8 x i16> %add
+}
+
+define <vscale x 4 x float> @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) { ; 2-way f32 case: masked-load nxv8f32, split it into two nxv4f32 parts, return their fadd
+; CHECK-LABEL: foo_ld2_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 p1.s, p0.s, p0.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p0.s
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x0]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: uzp2 z2.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) ; both interleave2 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr %p, i32 4, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x float> poison) ; align 4, poison passthru
+ %strided.vec = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %wide.masked.vec) ; overload suffix names the <vscale x 8 x float> operand type
+ %part1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %strided.vec, 1
+ %add = fadd <vscale x 4 x float> %part1, %part2 ; uses both deinterleaved parts
+ ret <vscale x 4 x float> %add
+}
+
+define <vscale x 2 x double> @foo_ld2_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) { ; 2-way f64 case: masked-load nxv4f64, split it into two nxv2f64 parts, return their fadd
+; CHECK-LABEL: foo_ld2_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 p1.d, p0.d, p0.d
+; CHECK-NEXT: zip2 p0.d, p0.d, p0.d
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: uzp2 z2.d, z1.d, z0.d
+; CHECK-NEXT: uzp1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fadd z0.d, z0.d, z2.d
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask) ; both interleave2 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %p, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison) ; align 8, poison passthru
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec) ; expected output above: two predicated ld1d loads plus uzp1/uzp2, no ld2d
+ %part1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %add = fadd <vscale x 2 x double> %part1, %part2 ; uses both deinterleaved parts
+ ret <vscale x 2 x double> %add
+}
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) { ; 4-way i8 case: masked-load nxv64i8, split it into four nxv16i8 parts, return the sum of all four
+; CHECK-LABEL: foo_ld4_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: add z1.b, z3.b, z1.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) ; align 1, poison passthru
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; expected output above: four predicated ld1b loads plus a two-level uzp1/uzp2 sequence, no ld4b
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 16 x i8> %add3
+}
+
+define <vscale x 8 x i16> @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) { ; 4-way i16 case: masked-load nxv32i16, split it into four nxv8i16 parts, return the sum of all four
+; CHECK-LABEL: foo_ld4_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
+; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
+; CHECK-NEXT: zip1 p2.h, p1.h, p1.h
+; CHECK-NEXT: zip2 p1.h, p1.h, p1.h
+; CHECK-NEXT: zip2 p3.h, p0.h, p0.h
+; CHECK-NEXT: ld1h { z3.h }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
+; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1h { z0.h }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uzp2 z4.h, z3.h, z2.h
+; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: uzp2 z5.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp2 z1.h, z5.h, z4.h
+; CHECK-NEXT: uzp2 z3.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z4.h, z5.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: add z1.h, z3.h, z1.h
+; CHECK-NEXT: add z0.h, z0.h, z4.h
+; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 32 x i16> @llvm.masked.load.nxv32i16.p0(ptr %p, i32 2, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i16> poison) ; align 2, poison passthru
+ %strided.vec = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %wide.masked.vec) ; expected output above: four predicated ld1h loads plus a two-level uzp1/uzp2 sequence, no ld4h
+ %part1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %strided.vec, 3
+ %add1 = add <vscale x 8 x i16> %part1, %part2
+ %add2 = add <vscale x 8 x i16> %part3, %part4
+ %add3 = add <vscale x 8 x i16> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 8 x i16> %add3
+}
+
+define <vscale x 4 x float> @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) { ; 4-way f32 case: masked-load nxv16f32, split it into four nxv4f32 parts, return the fadd of all four
+; CHECK-LABEL: foo_ld4_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.s, p0.s, p0.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
+; CHECK-NEXT: zip1 p2.s, p1.s, p1.s
+; CHECK-NEXT: zip2 p1.s, p1.s, p1.s
+; CHECK-NEXT: zip2 p3.s, p0.s, p0.s
+; CHECK-NEXT: ld1w { z3.s }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
+; CHECK-NEXT: ld1w { z2.s }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1w { z0.s }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uzp2 z4.s, z3.s, z2.s
+; CHECK-NEXT: uzp1 z2.s, z3.s, z2.s
+; CHECK-NEXT: uzp2 z5.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp2 z1.s, z5.s, z4.s
+; CHECK-NEXT: uzp2 z3.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z4.s, z5.s, z4.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT: fadd z1.s, z3.s, z1.s
+; CHECK-NEXT: fadd z0.s, z0.s, z4.s
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr %p, i32 4, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x float> poison) ; align 4, poison passthru
+ %strided.vec = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> %wide.masked.vec) ; expected output above: four predicated ld1w loads plus a two-level uzp1/uzp2 sequence, no ld4w
+ %part1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %strided.vec, 3
+ %add1 = fadd <vscale x 4 x float> %part1, %part2
+ %add2 = fadd <vscale x 4 x float> %part3, %part4
+ %add3 = fadd <vscale x 4 x float> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 4 x float> %add3
+}
+
+define <vscale x 2 x double> @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) { ; 4-way f64 case: masked-load nxv8f64, split it into four nxv2f64 parts, return the fadd of all four
+; CHECK-LABEL: foo_ld4_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.d, p0.d, p0.d
+; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
+; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
+; CHECK-NEXT: zip2 p1.d, p1.d, p1.d
+; CHECK-NEXT: zip2 p3.d, p0.d, p0.d
+; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
+; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: uzp2 z4.d, z3.d, z2.d
+; CHECK-NEXT: uzp1 z2.d, z3.d, z2.d
+; CHECK-NEXT: uzp2 z5.d, z1.d, z0.d
+; CHECK-NEXT: uzp1 z0.d, z1.d, z0.d
+; CHECK-NEXT: uzp2 z1.d, z5.d, z4.d
+; CHECK-NEXT: uzp2 z3.d, z0.d, z2.d
+; CHECK-NEXT: uzp1 z4.d, z5.d, z4.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
+; CHECK-NEXT: fadd z1.d, z3.d, z1.d
+; CHECK-NEXT: fadd z0.d, z0.d, z4.d
+; CHECK-NEXT: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 8 x double> @llvm.masked.load.nxv8f64.p0(ptr %p, i32 8, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x double> poison) ; align 8, poison passthru
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %wide.masked.vec) ; expected output above: four predicated ld1d loads plus a two-level uzp1/uzp2 sequence, no ld4d
+ %part1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 3
+ %add1 = fadd <vscale x 2 x double> %part1, %part2
+ %add2 = fadd <vscale x 2 x double> %part3, %part4
+ %add3 = fadd <vscale x 2 x double> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 2 x double> %add3
+}
+
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) { ; 4-way i8 case where the interleaved mask has two users: the masked load from %p and a masked store to %p2
+; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p4.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p0.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p2.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p4.b, p4.b
+; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p1.b, p4.b, p4.b
+; CHECK-NEXT: ld1b { z2.b }, p2/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: movi v2.2d, #0000000000000000
+; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z1.b, z3.b, z1.b
+; CHECK-NEXT: st1b { z2.b }, p2, [x1, #3, mul vl]
+; CHECK-NEXT: st1b { z2.b }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z2.b }, p3, [x1, #1, mul vl]
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: st1b { z2.b }, p1, [x1]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) ; all four operands are the same %mask; result is used by both the load and the store below
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) ; align 4 (the store below uses align 1), poison passthru
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; only user of the wide load
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2
+ tail call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> zeroinitializer, ptr %p2, i32 1, <vscale x 64 x i1> %interleaved.mask) ; second user of %interleaved.mask: stores zeros to %p2 under the same wide mask
+ ret <vscale x 16 x i8> %add3
+}
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) { ; 4-way i8 case whose mask is an interleave4 of four all-true splats (no mask argument)
+; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_interleaved_ones:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
+; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
+; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: add z1.b, z3.b, z1.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1)) ; every operand is splat(i1 1), so the wide mask is all-true
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) ; align 4, poison passthru; expected output above uses unpredicated ldr z loads
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; expected output above: uzp1/uzp2 sequence, no ld4b
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 16 x i8> %add3
+}
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_mask_of_ones(ptr %p) { ; 4-way i8 case whose mask is a plain all-true splat, with no interleave4 call at all
+; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
+; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
+; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: add z1.b, z3.b, z1.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> splat(i1 1), <vscale x 64 x i8> poison) ; mask operand is splat(i1 1) directly; align 4, poison passthru; expected output above uses unpredicated ldr z loads
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; expected output above: uzp1/uzp2 sequence, no ld4b
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 16 x i8> %add3
+}
+
+
+; Negative tests
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_mul_use_of_load(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) { ; negative test: the wide masked load has a second user (a store to %p2) besides deinterleave4
+; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z4.b }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0]
+; CHECK-NEXT: str z4, [x1, #2, mul vl]
+; CHECK-NEXT: uzp2 z0.b, z4.b, z3.b
+; CHECK-NEXT: uzp1 z6.b, z4.b, z3.b
+; CHECK-NEXT: str z3, [x1, #3, mul vl]
+; CHECK-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-NEXT: uzp2 z5.b, z2.b, z1.b
+; CHECK-NEXT: uzp1 z7.b, z2.b, z1.b
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp2 z24.b, z5.b, z0.b
+; CHECK-NEXT: uzp2 z25.b, z7.b, z6.b
+; CHECK-NEXT: uzp1 z0.b, z5.b, z0.b
+; CHECK-NEXT: uzp1 z5.b, z7.b, z6.b
+; CHECK-NEXT: add z0.b, z5.b, z0.b
+; CHECK-NEXT: add z5.b, z25.b, z24.b
+; CHECK-NEXT: add z0.b, z0.b, z5.b
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) ; used by both deinterleave4 and the masked store below
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; first user of the wide load
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2
+ tail call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> %wide.masked.vec, ptr %p2, i32 1, <vscale x 64 x i1> splat(i1 1)) ; second user: writes the still-interleaved data to %p2 under an all-true mask
+ ret <vscale x 16 x i8> %add3
+}
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_bad_mask(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2, ptr %p, ptr %p2) { ; negative test: the interleave4 operands are not all the same predicate; %p2 is not referenced in the body
+; CHECK-LABEL: foo_ld4_nxv16i8_bad_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: zip2 p2.b, p1.b, p0.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p1.b, p1.b, p0.b
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p4.b, p3.b, p2.b
+; CHECK-NEXT: zip2 p2.b, p3.b, p2.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p1.b
+; CHECK-NEXT: ld1b { z3.b }, p4/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: zip1 p0.b, p0.b, p1.b
+; CHECK-NEXT: ld1b { z2.b }, p2/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: add z1.b, z3.b, z1.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) ; second operand is %mask2, the other three are %mask
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) ; align 4, poison passthru
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; only user of the wide load
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 16 x i8> %add3
+}
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_bad_mask2(<vscale x 32 x i1> %mask, ptr %p, ptr %p2) { ; negative test: mask is built with interleave2 but the data is split with deinterleave4; %p2 is not referenced in the body
+; CHECK-LABEL: foo_ld4_nxv16i8_bad_mask2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: add z1.b, z3.b, z1.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> %mask, <vscale x 32 x i1> %mask) ; 2-way interleave of a <vscale x 32 x i1> %mask, not a 4-way interleave of a <vscale x 16 x i1>
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) ; align 4, poison passthru
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; 4-way split, so the interleave factor differs from the mask's factor of 2
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 16 x i8> %add3
+}
+
+define <vscale x 8 x i8> @foo_ld4_nxv8i8(<vscale x 8 x i1> %mask, ptr %p) { ; negative test: each deinterleaved part is <vscale x 8 x i8> (presumably not a type the combine accepts; confirm against the patch)
+; CHECK-LABEL: foo_ld4_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
+; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
+; CHECK-NEXT: zip2 p2.h, p1.h, p1.h
+; CHECK-NEXT: zip1 p1.h, p1.h, p1.h
+; CHECK-NEXT: zip2 p3.h, p0.h, p0.h
+; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
+; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b
+; CHECK-NEXT: uzp1 p0.b, p0.b, p3.b
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT: uunpkhi z2.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z3.h, z1.b
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uzp2 z4.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp2 z5.h, z1.h, z3.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z2.h, z5.h, z4.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z4.h, z5.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: add z1.h, z3.h, z2.h
+; CHECK-NEXT: add z0.h, z0.h, z4.h
+; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) ; wide load is only nxv32i8; expected output above uses two ld1b loads then uunpklo/uunpkhi to .h elements
+ %strided.vec = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) ; expected output above performs the uzp1/uzp2 split and the adds on .h elements
+ %part1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 8 x i8> %part1, %part2
+ %add2 = add <vscale x 8 x i8> %part3, %part4
+ %add3 = add <vscale x 8 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 8 x i8> %add3
+}
+
+define <vscale x 16 x i8> @foo_ld4_nxv16i8_bad_passthru(<vscale x 16 x i1> %mask, ptr %p) { ; negative test: the masked load's passthru is splat(i8 3) rather than poison
+; CHECK-LABEL: foo_ld4_nxv16i8_bad_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
+; CHECK-NEXT: mov z0.b, #3 // =0x3
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z4.b }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0]
+; CHECK-NEXT: sel z3.b, p1, z3.b, z0.b
+; CHECK-NEXT: sel z1.b, p3, z1.b, z0.b
+; CHECK-NEXT: sel z2.b, p0, z2.b, z0.b
+; CHECK-NEXT: mov z0.b, p2/m, z4.b
+; CHECK-NEXT: uzp2 z4.b, z0.b, z3.b
+; CHECK-NEXT: uzp2 z5.b, z2.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z3.b
+; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b
+; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z3.b, z1.b, z0.b
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z4.b
+; CHECK-NEXT: add z1.b, z3.b, z2.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) ; all four interleave4 operands are the same %mask
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> splat(i8 3)) ; passthru is splat(i8 3); expected output above merges it in with sel and a predicated mov of the #3 splat
+ %strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) ; only user of the wide load
+ %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i8> %part1, %part2
+ %add2 = add <vscale x 16 x i8> %part3, %part4
+ %add3 = add <vscale x 16 x i8> %add1, %add2 ; every deinterleaved part contributes to the result
+ ret <vscale x 16 x i8> %add3
+}
+
+
+; The masked load result is zero-extended to i16 before it is deinterleaved,
+; so the deinterleave does not operate directly on the loaded value. The CHECK
+; lines below expect four predicated ld1b loads followed by uunpk/uzp
+; sequences.
+define <vscale x 16 x i16> @foo_ld4_nxv16i8_exti16(<vscale x 16 x i1> %mask, ptr %p) {
+; CHECK-LABEL: foo_ld4_nxv16i8_exti16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
+; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z0.b }, p2/z, [x0]
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z2.b }, p3/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: uunpkhi z4.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z5.h, z1.b
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpkhi z6.h, z2.b
+; CHECK-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEXT: uunpkhi z7.h, z3.b
+; CHECK-NEXT: uunpklo z3.h, z3.b
+; CHECK-NEXT: uzp2 z25.h, z0.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h
+; CHECK-NEXT: uzp2 z24.h, z1.h, z5.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z5.h
+; CHECK-NEXT: uzp2 z4.h, z2.h, z6.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT: uzp2 z5.h, z3.h, z7.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z7.h
+; CHECK-NEXT: uzp2 z6.h, z25.h, z24.h
+; CHECK-NEXT: uzp2 z7.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp2 z26.h, z5.h, z4.h
+; CHECK-NEXT: uzp1 z4.h, z5.h, z4.h
+; CHECK-NEXT: uzp1 z5.h, z3.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z3.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z25.h, z24.h
+; CHECK-NEXT: add z1.h, z5.h, z4.h
+; CHECK-NEXT: add z0.h, z0.h, z3.h
+; CHECK-NEXT: add z2.h, z2.h, z26.h
+; CHECK-NEXT: add z3.h, z7.h, z6.h
+; CHECK-NEXT: add z0.h, z0.h, z3.h
+; CHECK-NEXT: add z1.h, z1.h, z2.h
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
+ %wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
+ %wide.masked.vec.ext = zext <vscale x 64 x i8> %wide.masked.vec to <vscale x 64 x i16>
+ %strided.vec = tail call { <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave4.nxv64i16(<vscale x 64 x i16> %wide.masked.vec.ext)
+ %part1 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16> } %strided.vec, 3
+ %add1 = add <vscale x 16 x i16> %part1, %part2
+ %add2 = add <vscale x 16 x i16> %part3, %part4
+ %add3 = add <vscale x 16 x i16> %add1, %add2
+ ret <vscale x 16 x i16> %add3
+}
+
+; A <vscale x 8 x i8> masked load is zero-extended to i16 before it is
+; deinterleaved into four <vscale x 2 x i16> parts. The CHECK lines below
+; expect a single predicated ld1b followed by uunpk/uzp sequences.
+define <vscale x 2 x i16> @foo_ld4_nxv8i8_exti16(<vscale x 2 x i1> %mask, ptr %p) {
+; CHECK-LABEL: foo_ld4_nxv8i8_exti16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 p1.d, p0.d, p0.d
+; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
+; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
+; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
+; CHECK-NEXT: zip2 p3.d, p0.d, p0.d
+; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
+; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p3.s
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z3.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uzp2 z4.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: uzp2 z5.d, z0.d, z3.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z3.d
+; CHECK-NEXT: uzp2 z2.d, z5.d, z4.d
+; CHECK-NEXT: uzp2 z3.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z4.d, z5.d, z4.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: add z1.d, z3.d, z2.d
+; CHECK-NEXT: add z0.d, z0.d, z4.d
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.vec = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr %p, i32 1, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x i8> poison)
+ %wide.masked.vec.ext = zext <vscale x 8 x i8> %wide.masked.vec to <vscale x 8 x i16>
+ %strided.vec = tail call { <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave4.nxv8i16(<vscale x 8 x i16> %wide.masked.vec.ext)
+ %part1 = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16> } %strided.vec, 0
+ %part2 = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16> } %strided.vec, 1
+ %part3 = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16> } %strided.vec, 2
+ %part4 = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i16> } %strided.vec, 3
+ %add1 = add <vscale x 2 x i16> %part1, %part2
+ %add2 = add <vscale x 2 x i16> %part3, %part4
+ %add3 = add <vscale x 2 x i16> %add1, %add2
+ ret <vscale x 2 x i16> %add3
+}
>From 62780d88b897921dba12f38b4950f29fce423554 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Aug 2025 13:48:00 +0000
Subject: [PATCH 2/2] [AArch64] Improve lowering for scalable masked
deinterleaving loads
For IR like this:
%mask = ... @llvm.vector.interleave2(<vscale x 16 x i1> %a, <vscale x 16 x i1> %a)
%vec = ... @llvm.masked.load(..., <vscale x 32 x i1> %mask, ...)
%dvec = ... @llvm.vector.deinterleave2(<vscale x 32 x i8> %vec)
where we're deinterleaving a wide masked load of a supported type whose
mask is an interleave of identical narrower masks, we can lower this
directly to an ld2b instruction. Similarly, we can also support other
variants of ld2 and ld4.
This PR adds a DAG combine to spot such patterns and lower to ld2X
or ld4X variants accordingly, whilst being careful to ensure the
masked load is only used by the deinterleave intrinsic.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 118 ++++++++++
.../AArch64/masked_deinterleaved_loads.ll | 221 ++++--------------
2 files changed, 165 insertions(+), 174 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 63a85faf344c4..cff28e006b22e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1178,6 +1178,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@@ -27010,6 +27011,121 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
+/// Try to fold a VECTOR_DEINTERLEAVE of a wide scalable masked load into a
+/// single SVE structured load (ld2 or ld4).
+///
+/// The fold applies when \p N has two or four operands that are the
+/// consecutive EXTRACT_SUBVECTOR pieces of one masked load, the pieces cover
+/// the whole load, each piece is a legal scalable vector with a known minimum
+/// size of 128 bits, and the load's mask is either a splat of 1 or the
+/// concatenation of all results of a VECTOR_INTERLEAVE whose inputs are
+/// identical.
+///
+/// \returns the result of DCI.CombineTo() on success, or an empty SDValue if
+/// the pattern does not match.
+static SDValue performVectorDeinterleaveCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  unsigned NumParts = N->getNumOperands();
+  if (NumParts != 2 && NumParts != 4)
+    return SDValue();
+
+  EVT SubVecTy = N->getValueType(0);
+
+  // At the moment we're unlikely to see a fixed-width vector deinterleave as
+  // we usually generate shuffles instead. Also insist on a legal type so that
+  // we never create a structured load whose results have an element type the
+  // target cannot hold directly (e.g. a 128-bit vector of i4 elements).
+  unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
+  if (!SubVecTy.isScalableVector() ||
+      SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+      MinNumElements == 1 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+    return SDValue();
+
+  // Make sure each input operand is the correct extract_subvector of the same
+  // wider vector, i.e. operand I starts at element I * MinNumElements.
+  SDValue Op0 = N->getOperand(0);
+  for (unsigned I = 0; I < NumParts; I++) {
+    SDValue OpI = N->getOperand(I);
+    if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        OpI->getOperand(0) != Op0->getOperand(0))
+      return SDValue();
+    if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
+      return SDValue();
+  }
+
+  // Normal loads are currently already handled by the InterleavedAccessPass so
+  // we don't expect to see them here. Bail out if the masked load has an
+  // unexpected number of uses, since we want to avoid a situation where we have
+  // both deinterleaving loads and normal loads in the same block. Also, discard
+  // masked loads that are volatile or atomic, expanding, extending, indexed,
+  // have an unexpected offset or have an unsupported passthru value until we
+  // find a valid use case.
+  auto *MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
+  if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
+      !MaskedLoad->isSimple() || MaskedLoad->isExpandingLoad() ||
+      MaskedLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+      MaskedLoad->getAddressingMode() != ISD::UNINDEXED ||
+      !MaskedLoad->getOffset().isUndef() ||
+      (!MaskedLoad->getPassThru()->isUndef() &&
+       !isZerosVector(MaskedLoad->getPassThru().getNode())))
+    return SDValue();
+
+  // The extracted pieces must cover the whole load. If the load were wider
+  // than NumParts sub-vectors, the narrow mask derived below would have more
+  // elements than SubVecTy and could not be used as the ld2/ld4 predicate.
+  if (MaskedLoad->getValueType(0).getVectorMinNumElements() !=
+      NumParts * MinNumElements)
+    return SDValue();
+
+  // Now prove that the mask is an interleave of identical masks.
+  SDValue Mask = MaskedLoad->getMask();
+  if (Mask->getOpcode() != ISD::SPLAT_VECTOR &&
+      Mask->getOpcode() != ISD::CONCAT_VECTORS)
+    return SDValue();
+
+  // TODO: Do we need to check the element type of the mask?
+  SDValue NarrowMask;
+  SDLoc DL(N);
+  if (Mask->getOpcode() == ISD::CONCAT_VECTORS) {
+    if (Mask->getNumOperands() != NumParts)
+      return SDValue();
+
+    // We should be concatenating each sequential result from a
+    // VECTOR_INTERLEAVE.
+    SDValue InterleaveOp = Mask->getOperand(0);
+    if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+        InterleaveOp->getNumOperands() != NumParts)
+      return SDValue();
+
+    for (unsigned I = 0; I < NumParts; I++) {
+      SDValue ConcatOp = Mask->getOperand(I);
+      if (ConcatOp.getResNo() != I ||
+          ConcatOp.getNode() != InterleaveOp.getNode())
+        return SDValue();
+    }
+
+    // Make sure the inputs to the vector interleave are identical.
+    for (unsigned I = 1; I < NumParts; I++) {
+      if (InterleaveOp->getOperand(I) != InterleaveOp->getOperand(0))
+        return SDValue();
+    }
+
+    NarrowMask = InterleaveOp->getOperand(0);
+  } else { // ISD::SPLAT_VECTOR
+    // Only a splat of 1 (all lanes active) is accepted; the narrow mask is
+    // then a splat of 1 with 1/NumParts of the element count.
+    auto *SplatVal = dyn_cast<ConstantSDNode>(Mask->getOperand(0));
+    if (!SplatVal || SplatVal->getZExtValue() != 1)
+      return SDValue();
+    ElementCount EC = Mask.getValueType().getVectorElementCount();
+    assert((EC.getKnownMinValue() % NumParts) == 0 &&
+           "Expected element count divisible by number of parts");
+    EC = ElementCount::getScalable(EC.getKnownMinValue() / NumParts);
+    NarrowMask =
+        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+                    DAG.getConstant(1, DL, MVT::i1));
+  }
+
+  // We can now generate a structured load! It takes the original chain, the
+  // narrow mask and the original base pointer, and produces NumParts vector
+  // results followed by a chain.
+  const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
+                                          : Intrinsic::aarch64_sve_ld4_sret;
+  SDValue NewLdOps[] = {MaskedLoad->getChain(),
+                        DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
+                        MaskedLoad->getBasePtr()};
+  SDValue Res;
+  if (NumParts == 2)
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
+  else
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
+                      NewLdOps);
+
+  // Result Idx of the structured load replaces result Idx of the deinterleave.
+  SmallVector<SDValue, 4> ResOps(NumParts);
+  for (unsigned Idx = 0; Idx < NumParts; Idx++)
+    ResOps[Idx] = SDValue(Res.getNode(), Idx);
+
+  // Replace uses of the original chain result with the new chain result.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
+                                SDValue(Res.getNode(), NumParts));
+  return DCI.CombineTo(N, ResOps, false);
+}
+
/// If the operand is a bitwise AND with a constant RHS, and the shift has a
/// constant RHS and is the only use, we can pull it out of the shift, i.e.
///
@@ -27078,6 +27194,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
+ case ISD::VECTOR_DEINTERLEAVE:
+ return performVectorDeinterleaveCombine(N, DCI, DAG);
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
diff --git a/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll b/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll
index 4b2032aa91e60..e7d9cb353941e 100644
--- a/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll
+++ b/llvm/test/CodeGen/AArch64/masked_deinterleaved_loads.ll
@@ -4,13 +4,8 @@
define <vscale x 16 x i8> @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp2 z2.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
-; CHECK-NEXT: add z0.b, z0.b, z2.b
+; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0]
+; CHECK-NEXT: add z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = tail call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
@@ -24,13 +19,8 @@ define <vscale x 16 x i8> @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
define <vscale x 8 x i16> @foo_ld2_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.h, p0.h, p0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p0.h
-; CHECK-NEXT: ld1h { z1.h }, p1/z, [x0]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp2 z2.h, z1.h, z0.h
-; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
-; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = tail call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i16> poison)
@@ -44,13 +34,8 @@ define <vscale x 8 x i16> @foo_ld2_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
define <vscale x 4 x float> @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.s, p0.s, p0.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p0.s
-; CHECK-NEXT: ld1w { z1.s }, p1/z, [x0]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp2 z2.s, z1.s, z0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
-; CHECK-NEXT: fadd z0.s, z0.s, z2.s
+; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
%wide.masked.vec = tail call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr %p, i32 4, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x float> poison)
@@ -64,13 +49,8 @@ define <vscale x 4 x float> @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
define <vscale x 2 x double> @foo_ld2_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.d, p0.d, p0.d
-; CHECK-NEXT: zip2 p0.d, p0.d, p0.d
-; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp2 z2.d, z1.d, z0.d
-; CHECK-NEXT: uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT: fadd z0.d, z0.d, z2.d
+; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: fadd z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %p, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
@@ -84,27 +64,10 @@ define <vscale x 2 x double> @foo_ld2_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
define <vscale x 16 x i8> @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
-; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
-; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
-; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
-; CHECK-NEXT: add z1.b, z3.b, z1.b
-; CHECK-NEXT: add z0.b, z0.b, z4.b
-; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
+; CHECK-NEXT: add z4.b, z0.b, z1.b
+; CHECK-NEXT: add z0.b, z2.b, z3.b
+; CHECK-NEXT: add z0.b, z4.b, z0.b
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -122,27 +85,10 @@ define <vscale x 16 x i8> @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
define <vscale x 8 x i16> @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p2.h, p1.h, p1.h
-; CHECK-NEXT: zip2 p1.h, p1.h, p1.h
-; CHECK-NEXT: zip2 p3.h, p0.h, p0.h
-; CHECK-NEXT: ld1h { z3.h }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1h { z0.h }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
-; CHECK-NEXT: uzp2 z4.h, z3.h, z2.h
-; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
-; CHECK-NEXT: uzp2 z5.h, z1.h, z0.h
-; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
-; CHECK-NEXT: uzp2 z1.h, z5.h, z4.h
-; CHECK-NEXT: uzp2 z3.h, z0.h, z2.h
-; CHECK-NEXT: uzp1 z4.h, z5.h, z4.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT: add z1.h, z3.h, z1.h
-; CHECK-NEXT: add z0.h, z0.h, z4.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: ld4h { z0.h - z3.h }, p0/z, [x0]
+; CHECK-NEXT: add z4.h, z0.h, z1.h
+; CHECK-NEXT: add z0.h, z2.h, z3.h
+; CHECK-NEXT: add z0.h, z4.h, z0.h
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = tail call <vscale x 32 x i16> @llvm.masked.load.nxv32i16.p0(ptr %p, i32 2, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i16> poison)
@@ -160,27 +106,10 @@ define <vscale x 8 x i16> @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
define <vscale x 4 x float> @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p2.s, p1.s, p1.s
-; CHECK-NEXT: zip2 p1.s, p1.s, p1.s
-; CHECK-NEXT: zip2 p3.s, p0.s, p0.s
-; CHECK-NEXT: ld1w { z3.s }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: ld1w { z2.s }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1w { z0.s }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
-; CHECK-NEXT: uzp2 z4.s, z3.s, z2.s
-; CHECK-NEXT: uzp1 z2.s, z3.s, z2.s
-; CHECK-NEXT: uzp2 z5.s, z1.s, z0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
-; CHECK-NEXT: uzp2 z1.s, z5.s, z4.s
-; CHECK-NEXT: uzp2 z3.s, z0.s, z2.s
-; CHECK-NEXT: uzp1 z4.s, z5.s, z4.s
-; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT: fadd z1.s, z3.s, z1.s
-; CHECK-NEXT: fadd z0.s, z0.s, z4.s
-; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0]
+; CHECK-NEXT: fadd z4.s, z0.s, z1.s
+; CHECK-NEXT: fadd z0.s, z2.s, z3.s
+; CHECK-NEXT: fadd z0.s, z4.s, z0.s
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
%wide.masked.vec = tail call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr %p, i32 4, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x float> poison)
@@ -198,27 +127,10 @@ define <vscale x 4 x float> @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
define <vscale x 2 x double> @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
-; CHECK-NEXT: zip2 p1.d, p1.d, p1.d
-; CHECK-NEXT: zip2 p3.d, p0.d, p0.d
-; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1d { z0.d }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
-; CHECK-NEXT: uzp2 z4.d, z3.d, z2.d
-; CHECK-NEXT: uzp1 z2.d, z3.d, z2.d
-; CHECK-NEXT: uzp2 z5.d, z1.d, z0.d
-; CHECK-NEXT: uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT: uzp2 z1.d, z5.d, z4.d
-; CHECK-NEXT: uzp2 z3.d, z0.d, z2.d
-; CHECK-NEXT: uzp1 z4.d, z5.d, z4.d
-; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
-; CHECK-NEXT: fadd z1.d, z3.d, z1.d
-; CHECK-NEXT: fadd z0.d, z0.d, z4.d
-; CHECK-NEXT: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: ld4d { z0.d - z3.d }, p0/z, [x0]
+; CHECK-NEXT: fadd z4.d, z0.d, z1.d
+; CHECK-NEXT: fadd z0.d, z2.d, z3.d
+; CHECK-NEXT: fadd z0.d, z4.d, z0.d
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
%wide.masked.vec = tail call <vscale x 8 x double> @llvm.masked.load.nxv8f64.p0(ptr %p, i32 8, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x double> poison)
@@ -237,40 +149,21 @@ define <vscale x 2 x double> @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
define <vscale x 16 x i8> @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) {
; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_mask:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p4.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p1.b, p1.b
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
; CHECK-NEXT: zip2 p2.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p3.b, p4.b, p4.b
-; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p1.b, p4.b, p4.b
-; CHECK-NEXT: ld1b { z2.b }, p2/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
-; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
-; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT: add z0.b, z0.b, z4.b
-; CHECK-NEXT: add z1.b, z3.b, z1.b
-; CHECK-NEXT: st1b { z2.b }, p2, [x1, #3, mul vl]
-; CHECK-NEXT: st1b { z2.b }, p0, [x1, #2, mul vl]
-; CHECK-NEXT: st1b { z2.b }, p3, [x1, #1, mul vl]
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: add z0.b, z0.b, z1.b
-; CHECK-NEXT: st1b { z2.b }, p1, [x1]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: zip1 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
+; CHECK-NEXT: add z4.b, z0.b, z1.b
+; CHECK-NEXT: add z0.b, z2.b, z3.b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
+; CHECK-NEXT: add z0.b, z4.b, z0.b
+; CHECK-NEXT: st1b { z1.b }, p2, [x1, #3, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p3, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p0, [x1]
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -289,21 +182,11 @@ define <vscale x 16 x i8> @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %m
define <vscale x 16 x i8> @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_interleaved_ones:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z1, [x0]
-; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
-; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
-; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
-; CHECK-NEXT: add z1.b, z3.b, z1.b
-; CHECK-NEXT: add z0.b, z0.b, z4.b
-; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
+; CHECK-NEXT: add z4.b, z0.b, z1.b
+; CHECK-NEXT: add z0.b, z2.b, z3.b
+; CHECK-NEXT: add z0.b, z4.b, z0.b
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1))
%wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -321,21 +204,11 @@ define <vscale x 16 x i8> @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) {
define <vscale x 16 x i8> @foo_ld4_nxv16i8_mask_of_ones(ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z1, [x0]
-; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
-; CHECK-NEXT: uzp2 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z1.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z0.b, z2.b
-; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
-; CHECK-NEXT: add z1.b, z3.b, z1.b
-; CHECK-NEXT: add z0.b, z0.b, z4.b
-; CHECK-NEXT: add z0.b, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
+; CHECK-NEXT: add z4.b, z0.b, z1.b
+; CHECK-NEXT: add z0.b, z2.b, z3.b
+; CHECK-NEXT: add z0.b, z4.b, z0.b
; CHECK-NEXT: ret
%wide.masked.vec = tail call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr %p, i32 4, <vscale x 64 x i1> splat(i1 1), <vscale x 64 x i8> poison)
%strided.vec = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
More information about the llvm-commits
mailing list