[llvm] [SelectionDAG] Fix unsafe cases for loop.dependence.{war/raw}.mask (PR #168565)
Sam Tebbs via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 08:46:49 PST 2025
https://github.com/SamTebbs33 created https://github.com/llvm/llvm-project/pull/168565
There is an unsafe case with the loop dependence mask intrinsics where the difference between the two pointers is less than or equal to half the vector length, e.g. ptrA = 0 and ptrB = 3 when the vector length is 32. Currently that produces a correct low-mask with 3 active lanes and an incorrect high mask with all lanes active. This PR adds a select on the high mask which guards against this case.
>From 0a8f1f191436e7be393574b61dc85074cd7585b3 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 17 Nov 2025 17:30:30 +0000
Subject: [PATCH] [SelectionDAG] Fix unsafe cases for
loop.dependence.{war/raw}.mask
There is an unsafe case with the loop dependence mask intrinsics where
the difference between the two pointers is less than or equal to half the
vector length, e.g. ptrA = 0 and ptrB = 3 when the vector length is 32. Currently that
produces a correct low-mask with 3 active lanes and an incorrect high mask with
all lanes active. This PR adds a select on the high mask which guards
against this case.
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 19 +
llvm/test/CodeGen/AArch64/alias_mask.ll | 440 ++++++++++--------
.../CodeGen/AArch64/alias_mask_scalable.ll | 288 +++++++-----
3 files changed, 425 insertions(+), 322 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 71eeee78bd868..c8d66cc21244f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1692,6 +1692,18 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
}
+/// Split a loop dependence mask.
+/// This is done by creating a high and low mask, each of half the vector
+/// length. A select of the high mask and a predicate of all zeroes is needed to
+/// guarantee that the high mask is safe. A case where simply producing a high
+/// mask without the select is unsafe is when the difference between the two
+/// pointers is less than or equal to half the vector length, e.g. ptrA = 0 and
+/// ptrB = 3 when the vector length is 32.
+/// The full 32xi1 mask should have three active lanes and the rest inactive;
+/// however, when half the vector length is added to ptrA to produce the high
+/// mask, the difference between ptrA and ptrB is now -13, which will result
+/// in a mask with all lanes active. The select will guard against this case
+/// by choosing a mask of all inactive lanes when ptrA + VL/2 >= ptrB.
void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc DL(N);
@@ -1708,7 +1720,14 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
: DAG.getConstant(Offset, DL, MVT::i64);
PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend);
+ EVT CmpVT = MVT::i1;
+ SDValue Cmp = DAG.getSetCC(DL, CmpVT, PtrA, PtrB, ISD::CondCode::SETUGE);
+ Cmp = DAG.getSplat(EVT::getVectorVT(*DAG.getContext(), CmpVT,
+ HiVT.getVectorMinNumElements(),
+ HiVT.isScalableVT()),
+ DL, Cmp);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2));
+ Hi = DAG.getSelect(DL, HiVT, Cmp, DAG.getConstant(0, DL, HiVT), Hi);
}
void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index fdd0a6a4709da..da14e17bf2463 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -101,26 +101,30 @@ define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_split:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add x9, x0, #16
-; CHECK-NEXT: whilewr p0.b, x0, x1
-; CHECK-NEXT: whilewr p1.b, x9, x1
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w10, hs
+; CHECK-NEXT: whilewr p0.b, x9, x1
; CHECK-NEXT: adrp x9, .LCPI8_0
-; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: dup v0.16b, w10
+; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: whilewr p0.b, x0, x1
+; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: shl v1.16b, v2.16b, #7
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0]
-; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
; CHECK-NEXT: addv h1, v1.8h
-; CHECK-NEXT: str h0, [x8]
-; CHECK-NEXT: str h1, [x8, #2]
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: str h1, [x8]
+; CHECK-NEXT: str h0, [x8, #2]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 1)
@@ -131,46 +135,61 @@ define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_split2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add x9, x0, #48
-; CHECK-NEXT: whilewr p0.b, x0, x1
; CHECK-NEXT: add x10, x0, #16
-; CHECK-NEXT: whilewr p1.b, x9, x1
-; CHECK-NEXT: add x9, x0, #32
-; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: whilewr p0.b, x9, x1
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w9, hs
+; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: dup v0.16b, w9
+; CHECK-NEXT: add x9, x0, #32
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w11, hs
+; CHECK-NEXT: cmp x10, x1
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cset w12, hs
+; CHECK-NEXT: whilewr p1.b, x9, x1
+; CHECK-NEXT: whilewr p0.b, x10, x1
+; CHECK-NEXT: dup v2.16b, w11
+; CHECK-NEXT: dup v5.16b, w12
+; CHECK-NEXT: mov z3.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: whilewr p1.b, x0, x1
; CHECK-NEXT: adrp x9, .LCPI9_0
+; CHECK-NEXT: cmge v0.16b, v0.16b, #0
+; CHECK-NEXT: mov z4.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: bic v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: whilewr p1.b, x10, x1
+; CHECK-NEXT: bic v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: shl v1.16b, v1.16b, #7
+; CHECK-NEXT: shl v2.16b, v3.16b, #7
+; CHECK-NEXT: shl v3.16b, v4.16b, #7
; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI9_0]
-; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z3.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
-; CHECK-NEXT: shl v2.16b, v2.16b, #7
-; CHECK-NEXT: shl v3.16b, v3.16b, #7
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
; CHECK-NEXT: cmlt v3.16b, v3.16b, #0
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT: ext v7.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: ext v7.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip1 v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: zip1 v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: zip1 v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: zip1 v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: zip1 v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v7.16b
; CHECK-NEXT: addv h1, v1.8h
; CHECK-NEXT: addv h2, v2.8h
; CHECK-NEXT: addv h3, v3.8h
-; CHECK-NEXT: str h0, [x8]
-; CHECK-NEXT: str h1, [x8, #6]
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: str h1, [x8]
; CHECK-NEXT: str h2, [x8, #4]
; CHECK-NEXT: str h3, [x8, #2]
+; CHECK-NEXT: str h0, [x8, #6]
; CHECK-NEXT: ret
entry:
%0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1(ptr %a, ptr %b, i64 1)
@@ -227,69 +246,74 @@ entry:
define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_16_expand2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x9, x1, x0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: sub x10, x9, #32
-; CHECK-NEXT: add x9, x9, x9, lsr #63
+; CHECK-NEXT: add x9, x0, #32
+; CHECK-NEXT: sub x10, x1, x0
+; CHECK-NEXT: subs x9, x1, x9
; CHECK-NEXT: add x10, x10, x10, lsr #63
-; CHECK-NEXT: asr x9, x9, #1
-; CHECK-NEXT: asr x10, x10, #1
+; CHECK-NEXT: add x11, x9, x9, lsr #63
+; CHECK-NEXT: asr x9, x10, #1
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: dup v7.2d, x9
-; CHECK-NEXT: dup v16.2d, x10
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: mov z18.d, z0.d
+; CHECK-NEXT: asr x10, x11, #1
+; CHECK-NEXT: dup v3.2d, x9
; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
+; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT: dup v6.2d, x10
+; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT: add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT: add z18.d, z18.d, #14 // =0xe
+; CHECK-NEXT: add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT: cmhi v17.2d, v3.2d, v0.2d
+; CHECK-NEXT: cmhi v19.2d, v3.2d, v1.2d
+; CHECK-NEXT: cmhi v20.2d, v3.2d, v2.2d
+; CHECK-NEXT: cset w11, ls
+; CHECK-NEXT: cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT: cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT: cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT: cmhi v21.2d, v6.2d, v4.2d
+; CHECK-NEXT: cmhi v22.2d, v6.2d, v18.2d
+; CHECK-NEXT: cmhi v23.2d, v6.2d, v5.2d
+; CHECK-NEXT: cmhi v24.2d, v6.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d
+; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d
+; CHECK-NEXT: cmhi v18.2d, v3.2d, v18.2d
; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d
+; CHECK-NEXT: cmhi v7.2d, v3.2d, v7.2d
+; CHECK-NEXT: cmhi v3.2d, v3.2d, v16.2d
; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v22.4s
+; CHECK-NEXT: uzp1 v2.4s, v21.4s, v2.4s
; CHECK-NEXT: cmp x9, #1
-; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s
+; CHECK-NEXT: uzp1 v16.4s, v24.4s, v23.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v6.4s
; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h
-; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h
-; CHECK-NEXT: dup v2.16b, w9
+; CHECK-NEXT: uzp1 v6.4s, v19.4s, v18.4s
+; CHECK-NEXT: uzp1 v4.4s, v4.4s, v20.4s
+; CHECK-NEXT: uzp1 v5.4s, v7.4s, v5.4s
+; CHECK-NEXT: uzp1 v3.4s, v17.4s, v3.4s
+; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v16.8h
+; CHECK-NEXT: uzp1 v2.8h, v4.8h, v6.8h
+; CHECK-NEXT: uzp1 v3.8h, v3.8h, v5.8h
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w10
+; CHECK-NEXT: uzp1 v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI11_0
-; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: dup v3.16b, w10
-; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w11
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: shl v1.16b, v2.16b, #7
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0]
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
@@ -393,85 +417,89 @@ entry:
define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_32_expand3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x10, x1, x0
+; CHECK-NEXT: add x9, x0, #64
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: sub x9, x10, #61
-; CHECK-NEXT: subs x11, x10, #64
-; CHECK-NEXT: add x12, x10, #3
-; CHECK-NEXT: csel x9, x9, x11, mi
+; CHECK-NEXT: subs x9, x1, x9
+; CHECK-NEXT: add x10, x9, #3
+; CHECK-NEXT: csel x9, x10, x9, mi
; CHECK-NEXT: asr x11, x9, #2
+; CHECK-NEXT: cset w9, ls
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: cmp x11, #1
; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: csel x10, x12, x10, mi
-; CHECK-NEXT: dup v7.2d, x11
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: mov z17.d, z0.d
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: subs x12, x1, x0
+; CHECK-NEXT: add x13, x12, #3
+; CHECK-NEXT: dup v5.2d, x11
; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: asr x10, x10, #2
+; CHECK-NEXT: csel x12, x13, x12, mi
; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: dup v16.2d, x10
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v16.2d, v6.2d
-; CHECK-NEXT: cmhi v5.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
-; CHECK-NEXT: cmhi v6.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v0.2d, v7.2d, v0.2d
-; CHECK-NEXT: uzp1 v7.4s, v21.4s, v20.4s
+; CHECK-NEXT: add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT: add z17.d, z17.d, #14 // =0xe
+; CHECK-NEXT: add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT: asr x12, x12, #2
+; CHECK-NEXT: cmhi v18.2d, v5.2d, v0.2d
+; CHECK-NEXT: cmhi v19.2d, v5.2d, v1.2d
+; CHECK-NEXT: cmhi v20.2d, v5.2d, v2.2d
+; CHECK-NEXT: cmhi v21.2d, v5.2d, v3.2d
+; CHECK-NEXT: dup v6.2d, x12
+; CHECK-NEXT: cmhi v22.2d, v5.2d, v4.2d
+; CHECK-NEXT: cmhi v23.2d, v5.2d, v7.2d
+; CHECK-NEXT: cmhi v24.2d, v5.2d, v17.2d
+; CHECK-NEXT: cmhi v5.2d, v5.2d, v16.2d
+; CHECK-NEXT: cmp x12, #1
+; CHECK-NEXT: cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT: cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT: cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT: cmhi v3.2d, v6.2d, v3.2d
+; CHECK-NEXT: cmhi v4.2d, v6.2d, v4.2d
+; CHECK-NEXT: cmhi v17.2d, v6.2d, v17.2d
+; CHECK-NEXT: cmhi v7.2d, v6.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT: uzp1 v16.4s, v19.4s, v24.4s
+; CHECK-NEXT: uzp1 v19.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
+; CHECK-NEXT: uzp1 v5.4s, v18.4s, v5.4s
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v17.4s
; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v23.4s, v4.4s
-; CHECK-NEXT: uzp1 v4.4s, v18.4s, v24.4s
-; CHECK-NEXT: uzp1 v5.4s, v5.4s, v22.4s
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT: uzp1 v6.4s, v17.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
-; CHECK-NEXT: uzp1 v3.8h, v4.8h, v3.8h
+; CHECK-NEXT: uzp1 v3.4s, v7.4s, v4.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v6.4s
+; CHECK-NEXT: uzp1 v4.8h, v19.8h, v16.8h
+; CHECK-NEXT: uzp1 v5.8h, v5.8h, v20.8h
; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT: uzp1 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT: uzp1 v0.8h, v7.8h, v0.8h
-; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: uzp1 v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; CHECK-NEXT: dup v3.16b, w10
-; CHECK-NEXT: dup v2.16b, w9
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: uzp1 v2.16b, v5.16b, v4.16b
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w10
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI14_0
-; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v1.16b, v2.16b, v3.16b
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: str h0, [x8, #2]
+; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: str h0, [x8]
+; CHECK-NEXT: str h1, [x8, #2]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -587,85 +615,89 @@ entry:
define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_64_expand4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x10, x1, x0
+; CHECK-NEXT: add x9, x0, #128
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: sub x9, x10, #121
-; CHECK-NEXT: subs x11, x10, #128
-; CHECK-NEXT: add x12, x10, #7
-; CHECK-NEXT: csel x9, x9, x11, mi
+; CHECK-NEXT: subs x9, x1, x9
+; CHECK-NEXT: add x10, x9, #7
+; CHECK-NEXT: csel x9, x10, x9, mi
; CHECK-NEXT: asr x11, x9, #3
+; CHECK-NEXT: cset w9, ls
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: cmp x11, #1
; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: csel x10, x12, x10, mi
-; CHECK-NEXT: dup v7.2d, x11
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: mov z17.d, z0.d
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: subs x12, x1, x0
+; CHECK-NEXT: add x13, x12, #7
+; CHECK-NEXT: dup v5.2d, x11
; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: asr x10, x10, #3
+; CHECK-NEXT: csel x12, x13, x12, mi
; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: dup v16.2d, x10
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v16.2d, v6.2d
-; CHECK-NEXT: cmhi v5.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
-; CHECK-NEXT: cmhi v6.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v0.2d, v7.2d, v0.2d
-; CHECK-NEXT: uzp1 v7.4s, v21.4s, v20.4s
+; CHECK-NEXT: add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT: add z17.d, z17.d, #14 // =0xe
+; CHECK-NEXT: add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT: asr x12, x12, #3
+; CHECK-NEXT: cmhi v18.2d, v5.2d, v0.2d
+; CHECK-NEXT: cmhi v19.2d, v5.2d, v1.2d
+; CHECK-NEXT: cmhi v20.2d, v5.2d, v2.2d
+; CHECK-NEXT: cmhi v21.2d, v5.2d, v3.2d
+; CHECK-NEXT: dup v6.2d, x12
+; CHECK-NEXT: cmhi v22.2d, v5.2d, v4.2d
+; CHECK-NEXT: cmhi v23.2d, v5.2d, v7.2d
+; CHECK-NEXT: cmhi v24.2d, v5.2d, v17.2d
+; CHECK-NEXT: cmhi v5.2d, v5.2d, v16.2d
+; CHECK-NEXT: cmp x12, #1
+; CHECK-NEXT: cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT: cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT: cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT: cmhi v3.2d, v6.2d, v3.2d
+; CHECK-NEXT: cmhi v4.2d, v6.2d, v4.2d
+; CHECK-NEXT: cmhi v17.2d, v6.2d, v17.2d
+; CHECK-NEXT: cmhi v7.2d, v6.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT: uzp1 v16.4s, v19.4s, v24.4s
+; CHECK-NEXT: uzp1 v19.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
+; CHECK-NEXT: uzp1 v5.4s, v18.4s, v5.4s
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v17.4s
; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v23.4s, v4.4s
-; CHECK-NEXT: uzp1 v4.4s, v18.4s, v24.4s
-; CHECK-NEXT: uzp1 v5.4s, v5.4s, v22.4s
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT: uzp1 v6.4s, v17.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
-; CHECK-NEXT: uzp1 v3.8h, v4.8h, v3.8h
+; CHECK-NEXT: uzp1 v3.4s, v7.4s, v4.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v6.4s
+; CHECK-NEXT: uzp1 v4.8h, v19.8h, v16.8h
+; CHECK-NEXT: uzp1 v5.8h, v5.8h, v20.8h
; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT: uzp1 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT: uzp1 v0.8h, v7.8h, v0.8h
-; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: uzp1 v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; CHECK-NEXT: dup v3.16b, w10
-; CHECK-NEXT: dup v2.16b, w9
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: uzp1 v2.16b, v5.16b, v4.16b
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w10
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI18_0
-; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v1.16b, v2.16b, v3.16b
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_0]
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: str h0, [x8, #2]
+; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: str h0, [x8]
+; CHECK-NEXT: str h1, [x8, #2]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 8)
diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
index 3435ceca28e17..b9a9484a33e7b 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
@@ -84,9 +84,15 @@ entry:
define <vscale x 32 x i1> @whilewr_8_split(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_split:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: incb x8
+; CHECK-NEXT: cmp x8, x1
+; CHECK-NEXT: cset w9, hs
+; CHECK-NEXT: whilewr p0.b, x8, x1
+; CHECK-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NEXT: bic p1.b, p0/z, p0.b, p1.b
; CHECK-NEXT: whilewr p0.b, x0, x1
-; CHECK-NEXT: incb x0
-; CHECK-NEXT: whilewr p1.b, x0, x1
; CHECK-NEXT: ret
entry:
%0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 1)
@@ -96,14 +102,40 @@ entry:
define <vscale x 64 x i1> @whilewr_8_split2(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_split2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov x10, x0
+; CHECK-NEXT: rdvl x8, #3
+; CHECK-NEXT: incb x9
+; CHECK-NEXT: incb x10, all, mul #2
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w11, hs
+; CHECK-NEXT: whilewr p0.b, x9, x1
+; CHECK-NEXT: sbfx x11, x11, #0, #1
+; CHECK-NEXT: whilelo p1.b, xzr, x11
+; CHECK-NEXT: cmp x10, x1
+; CHECK-NEXT: cset w9, hs
+; CHECK-NEXT: whilewr p2.b, x10, x1
+; CHECK-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NEXT: bic p1.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p3.b, xzr, x9
+; CHECK-NEXT: cmp x8, x1
+; CHECK-NEXT: cset w9, hs
+; CHECK-NEXT: whilewr p4.b, x8, x1
+; CHECK-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NEXT: bic p2.b, p2/z, p2.b, p3.b
+; CHECK-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NEXT: bic p4.b, p4/z, p4.b, p0.b
; CHECK-NEXT: whilewr p0.b, x0, x1
-; CHECK-NEXT: addvl x9, x0, #3
-; CHECK-NEXT: incb x0, all, mul #2
-; CHECK-NEXT: incb x8
-; CHECK-NEXT: whilewr p3.b, x9, x1
-; CHECK-NEXT: whilewr p2.b, x0, x1
-; CHECK-NEXT: whilewr p1.b, x8, x1
+; CHECK-NEXT: bic p3.b, p4/z, p4.b, p3.b
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = call <vscale x 64 x i1> @llvm.loop.dependence.war.mask.nxv64i1(ptr %a, ptr %b, i64 1)
@@ -176,6 +208,8 @@ define <vscale x 32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p11, [sp] // 2-byte Spill
+; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Spill
; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Spill
; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Spill
; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Spill
@@ -186,75 +220,81 @@ define <vscale x 32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: sub x8, x1, x0
-; CHECK-NEXT: incb x0, all, mul #2
-; CHECK-NEXT: add x8, x8, x8, lsr #63
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NEXT: incb x0, all, mul #2
; CHECK-NEXT: asr x8, x8, #1
-; CHECK-NEXT: sub x9, x1, x0
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z4.d, z0.d
; CHECK-NEXT: mov z5.d, x8
-; CHECK-NEXT: add x9, x9, x9, lsr #63
; CHECK-NEXT: incd z1.d
; CHECK-NEXT: incd z2.d, all, mul #2
-; CHECK-NEXT: incd z3.d, all, mul #4
-; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z0.d
-; CHECK-NEXT: asr x9, x9, #1
-; CHECK-NEXT: mov z4.d, z1.d
-; CHECK-NEXT: mov z6.d, z1.d
-; CHECK-NEXT: mov z7.d, z2.d
-; CHECK-NEXT: cmphi p1.d, p0/z, z5.d, z1.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z3.d
+; CHECK-NEXT: incd z4.d, all, mul #4
+; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z0.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z2.d
+; CHECK-NEXT: mov z7.d, z1.d
+; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d
; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: incd z4.d, all, mul #2
+; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d
+; CHECK-NEXT: incd z3.d, all, mul #2
; CHECK-NEXT: incd z6.d, all, mul #4
; CHECK-NEXT: incd z7.d, all, mul #4
-; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT: mov z24.d, z4.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z6.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d
+; CHECK-NEXT: uzp1 p6.s, p7.s, p6.s
+; CHECK-NEXT: mov z24.d, z3.d
+; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z6.d
+; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
+; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d
; CHECK-NEXT: incd z24.d, all, mul #4
-; CHECK-NEXT: uzp1 p2.s, p3.s, p4.s
-; CHECK-NEXT: uzp1 p3.s, p5.s, p6.s
-; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z24.d
-; CHECK-NEXT: mov z5.d, x9
+; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s
+; CHECK-NEXT: uzp1 p5.s, p5.s, p9.s
+; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d
; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h
+; CHECK-NEXT: uzp1 p5.h, p6.h, p5.h
; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT: uzp1 p7.s, p7.s, p8.s
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z2.d
; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p2.h, p2.h, p7.h
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z1.d
+; CHECK-NEXT: uzp1 p2.s, p2.s, p4.s
+; CHECK-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NEXT: subs x8, x1, x0
+; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h
+; CHECK-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NEXT: cset w9, ls
+; CHECK-NEXT: uzp1 p2.b, p5.b, p2.b
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
+; CHECK-NEXT: asr x8, x8, #1
+; CHECK-NEXT: mov z5.d, x8
+; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z24.d
+; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z6.d
+; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
+; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z4.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z3.d
+; CHECK-NEXT: cmphi p11.d, p0/z, z5.d, z2.d
+; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d
; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d
-; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s
-; CHECK-NEXT: uzp1 p5.s, p9.s, p6.s
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: uzp1 p7.s, p9.s, p7.s
+; CHECK-NEXT: cset w8, lt
; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT: whilelo p6.b, xzr, x8
-; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s
-; CHECK-NEXT: cmp x9, #1
+; CHECK-NEXT: uzp1 p8.s, p10.s, p8.s
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
+; CHECK-NEXT: uzp1 p6.s, p11.s, p6.s
+; CHECK-NEXT: ldr p11, [sp] // 2-byte Reload
+; CHECK-NEXT: uzp1 p0.s, p0.s, p4.s
+; CHECK-NEXT: uzp1 p4.h, p8.h, p7.h
; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s
-; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: uzp1 p0.h, p0.h, p6.h
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p4.h, p5.h, p4.h
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h
-; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b
-; CHECK-NEXT: uzp1 p2.b, p0.b, p4.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
; CHECK-NEXT: whilelo p3.b, xzr, x8
-; CHECK-NEXT: sel p0.b, p1, p1.b, p6.b
+; CHECK-NEXT: sbfx x8, x9, #0, #1
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT: sel p1.b, p2, p2.b, p3.b
+; CHECK-NEXT: uzp1 p0.b, p0.b, p4.b
+; CHECK-NEXT: whilelo p4.b, xzr, x8
+; CHECK-NEXT: mov p3.b, p0/m, p0.b
+; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b
+; CHECK-NEXT: bic p1.b, p3/z, p3.b, p4.b
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -364,6 +404,7 @@ define <vscale x 32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p11, [sp] // 2-byte Spill
; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Spill
; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Spill
; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Spill
@@ -387,65 +428,70 @@ define <vscale x 32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-NEXT: incd z1.d
; CHECK-NEXT: incd z2.d, all, mul #2
; CHECK-NEXT: incd z4.d, all, mul #4
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d
+; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z0.d
; CHECK-NEXT: mov z3.d, z1.d
; CHECK-NEXT: mov z6.d, z2.d
; CHECK-NEXT: mov z7.d, z1.d
-; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d
+; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d
+; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z2.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d
; CHECK-NEXT: incd z3.d, all, mul #2
; CHECK-NEXT: incd z6.d, all, mul #4
; CHECK-NEXT: incd z7.d, all, mul #4
-; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s
+; CHECK-NEXT: uzp1 p6.s, p7.s, p6.s
; CHECK-NEXT: mov z24.d, z3.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d
+; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z6.d
+; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
+; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d
; CHECK-NEXT: incd z24.d, all, mul #4
-; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s
; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d
+; CHECK-NEXT: uzp1 p5.s, p5.s, p9.s
+; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d
; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h
+; CHECK-NEXT: uzp1 p5.h, p6.h, p5.h
; CHECK-NEXT: cset w8, lt
; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s
+; CHECK-NEXT: uzp1 p2.s, p2.s, p4.s
; CHECK-NEXT: whilelo p1.b, xzr, x8
; CHECK-NEXT: subs x8, x1, x0
-; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h
+; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h
; CHECK-NEXT: add x9, x8, #3
; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b
+; CHECK-NEXT: cset w9, ls
+; CHECK-NEXT: uzp1 p2.b, p5.b, p2.b
; CHECK-NEXT: asr x8, x8, #2
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
; CHECK-NEXT: mov z5.d, x8
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d
+; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z24.d
+; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z6.d
; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d
-; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d
+; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z4.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z3.d
+; CHECK-NEXT: cmphi p11.d, p0/z, z5.d, z2.d
+; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d
; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d
; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s
+; CHECK-NEXT: uzp1 p7.s, p9.s, p7.s
; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s
-; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s
+; CHECK-NEXT: uzp1 p8.s, p10.s, p8.s
+; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s
+; CHECK-NEXT: uzp1 p6.s, p11.s, p6.s
+; CHECK-NEXT: ldr p11, [sp] // 2-byte Reload
+; CHECK-NEXT: uzp1 p0.s, p0.s, p4.s
+; CHECK-NEXT: uzp1 p4.h, p8.h, p7.h
; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h
+; CHECK-NEXT: uzp1 p0.h, p0.h, p6.h
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h
+; CHECK-NEXT: whilelo p3.b, xzr, x8
+; CHECK-NEXT: sbfx x8, x9, #0, #1
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload
+; CHECK-NEXT: uzp1 p0.b, p0.b, p4.b
; CHECK-NEXT: whilelo p4.b, xzr, x8
-; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
+; CHECK-NEXT: mov p3.b, p0/m, p0.b
; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b
-; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b
+; CHECK-NEXT: bic p1.b, p3/z, p3.b, p4.b
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -582,6 +628,7 @@ define <vscale x 32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p11, [sp] // 2-byte Spill
; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Spill
; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Spill
; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Spill
@@ -605,65 +652,70 @@ define <vscale x 32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
; CHECK-NEXT: incd z1.d
; CHECK-NEXT: incd z2.d, all, mul #2
; CHECK-NEXT: incd z4.d, all, mul #4
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d
+; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z0.d
; CHECK-NEXT: mov z3.d, z1.d
; CHECK-NEXT: mov z6.d, z2.d
; CHECK-NEXT: mov z7.d, z1.d
-; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d
+; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d
+; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z2.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d
; CHECK-NEXT: incd z3.d, all, mul #2
; CHECK-NEXT: incd z6.d, all, mul #4
; CHECK-NEXT: incd z7.d, all, mul #4
-; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s
+; CHECK-NEXT: uzp1 p6.s, p7.s, p6.s
; CHECK-NEXT: mov z24.d, z3.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d
+; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z6.d
+; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
+; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d
; CHECK-NEXT: incd z24.d, all, mul #4
-; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s
; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d
+; CHECK-NEXT: uzp1 p5.s, p5.s, p9.s
+; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d
; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h
+; CHECK-NEXT: uzp1 p5.h, p6.h, p5.h
; CHECK-NEXT: cset w8, lt
; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s
+; CHECK-NEXT: uzp1 p2.s, p2.s, p4.s
; CHECK-NEXT: whilelo p1.b, xzr, x8
; CHECK-NEXT: subs x8, x1, x9
-; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h
+; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h
; CHECK-NEXT: add x9, x8, #7
; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b
+; CHECK-NEXT: cset w9, ls
+; CHECK-NEXT: uzp1 p2.b, p5.b, p2.b
; CHECK-NEXT: asr x8, x8, #3
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
; CHECK-NEXT: mov z5.d, x8
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d
+; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z24.d
+; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z6.d
; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d
-; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d
+; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z4.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z3.d
+; CHECK-NEXT: cmphi p11.d, p0/z, z5.d, z2.d
+; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d
; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d
; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s
+; CHECK-NEXT: uzp1 p7.s, p9.s, p7.s
; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s
-; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s
+; CHECK-NEXT: uzp1 p8.s, p10.s, p8.s
+; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s
+; CHECK-NEXT: uzp1 p6.s, p11.s, p6.s
+; CHECK-NEXT: ldr p11, [sp] // 2-byte Reload
+; CHECK-NEXT: uzp1 p0.s, p0.s, p4.s
+; CHECK-NEXT: uzp1 p4.h, p8.h, p7.h
; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h
+; CHECK-NEXT: uzp1 p0.h, p0.h, p6.h
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h
+; CHECK-NEXT: whilelo p3.b, xzr, x8
+; CHECK-NEXT: sbfx x8, x9, #0, #1
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload
+; CHECK-NEXT: uzp1 p0.b, p0.b, p4.b
; CHECK-NEXT: whilelo p4.b, xzr, x8
-; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
+; CHECK-NEXT: mov p3.b, p0/m, p0.b
; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b
-; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b
+; CHECK-NEXT: bic p1.b, p3/z, p3.b, p4.b
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
More information about the llvm-commits
mailing list