[llvm] [AArch64] Lower alias mask to a whilewr (PR #100769)
Sam Tebbs via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 30 08:57:53 PDT 2024
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100769
>From c943b046e5eeb5faae74783b80137593a43760e6 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 26 Jun 2024 09:55:45 +0100
Subject: [PATCH 01/12] [AArch64] Lower alias mask to a whilewr
https://github.com/llvm/llvm-project/pull/100579 emits IR that creates a
mask disabling lanes that could alias within a loop iteration, based on
a pair of pointers. This PR lowers that IR to a WHILEWR instruction for
AArch64 when SVE2 is available; without SVE2 the IR is lowered as before.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 82 ++
.../LoopVectorize/AArch64/alias_mask.ll | 884 ++++++++++++++++++
2 files changed, 966 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d86e52d49000a..c2e9ba6291855 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -94,6 +94,7 @@
#include <bitset>
#include <cassert>
#include <cctype>
+#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iterator>
@@ -1523,6 +1524,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -13782,8 +13784,88 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
return ResultSLI;
}
+/// Try to lower the construction of a pointer alias mask to a WHILEWR.
+/// The mask's enabled lanes represent the elements that will not overlap
+/// across one loop iteration.
+///
+/// This tries to match, with the OR operands in either order:
+///   or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+///      (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
+/// and rewrites it to the aarch64_sve_whilewr_{b,h,s,d} intrinsic on
+/// (ptrA, ptrB), selected by element_size (1, 2, 4 or 8 bytes).
+///
+/// \param Op  The ISD::OR node to inspect.
+/// \param DAG The DAG used to query the subtarget and to build the new node.
+/// \returns The WHILEWR intrinsic node, or an empty SDValue if SVE2 is not
+/// available or \p Op does not have the shape described above.
+static SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
+  if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
+    return SDValue();
+
+  SDValue LaneMask = Op.getOperand(0);
+  SDValue Splat = Op.getOperand(1);
+  // OR is commutative, so the splat may appear on either side.
+  if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+    std::swap(LaneMask, Splat);
+
+  if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
+      Splat.getOpcode() != ISD::SPLAT_VECTOR)
+    return SDValue();
+
+  // The splatted scalar must be a signed less-than comparison. Operand 2 of a
+  // SETCC is its condition code, so an unconditional cast is sufficient.
+  SDValue Cmp = Splat.getOperand(0);
+  if (Cmp.getOpcode() != ISD::SETCC ||
+      cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT)
+    return SDValue();
+
+  // The compared value must be the 64-bit pointer difference. Checking it
+  // before reading the constant also pins the constant's width to 64 bits.
+  SDValue Diff = Cmp.getOperand(0);
+  if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
+    return SDValue();
+
+  auto *ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
+  if (!ComparatorConst)
+    return SDValue();
+
+  // The constant is -(element_size - 1) and only 1, 2, 4 and 8 byte elements
+  // have a WHILEWR variant. Range-check in 64 bits before narrowing so that a
+  // large negative constant cannot wrap around to a small element size.
+  int64_t CompValue = ComparatorConst->getSExtValue();
+  if (CompValue > 0 || CompValue < -7)
+    return SDValue();
+  unsigned EltSize = static_cast<unsigned>(1 - CompValue);
+  if (!isPowerOf2_64(EltSize))
+    return SDValue();
+
+  // The b/h/s/d variants pair with 16/8/4/2-lane scalable predicates
+  // respectively; reject any other result type instead of building a node
+  // whose type does not agree with the chosen variant.
+  EVT VT = Op.getValueType();
+  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
+      VT.getVectorMinNumElements() != 16 / EltSize)
+    return SDValue();
+
+  // The lane mask must start at lane 0.
+  if (!isNullConstant(LaneMask.getOperand(1)))
+    return SDValue();
+
+  SDValue NumLanes = LaneMask.getOperand(2);
+  if (EltSize == 1) {
+    // An alias mask for i8 elements omits the division because it would just
+    // divide by 1, so the lane count is the difference itself.
+    if (NumLanes != Diff)
+      return SDValue();
+  } else {
+    // The division by the element size shows up as an arithmetic shift right
+    // by log2(element_size).
+    // NOTE(review): the shifted value is not checked to be derived from Diff,
+    // so an unrelated `sra X, log2(element_size)` is accepted here as well.
+    // Tightening this needs the exact expansion of the signed division --
+    // TODO confirm and match it.
+    if (NumLanes.getOpcode() != ISD::SRA)
+      return SDValue();
+    auto *ShiftAmt = dyn_cast<ConstantSDNode>(NumLanes.getOperand(1));
+    if (!ShiftAmt || ShiftAmt->getZExtValue() != Log2_64(EltSize))
+      return SDValue();
+  }
+
+  // Indexed by log2(element_size); EltSize is one of 1, 2, 4, 8 at this point.
+  static const unsigned WhileWRIntrinsics[] = {
+      Intrinsic::aarch64_sve_whilewr_b, Intrinsic::aarch64_sve_whilewr_h,
+      Intrinsic::aarch64_sve_whilewr_s, Intrinsic::aarch64_sve_whilewr_d};
+
+  SDValue StorePtr = Diff.getOperand(0);
+  SDValue ReadPtr = Diff.getOperand(1);
+
+  SDLoc DL(Op);
+  SDValue ID =
+      DAG.getConstant(WhileWRIntrinsics[Log2_64(EltSize)], DL, MVT::i32);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, StorePtr, ReadPtr);
+}
+
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
+
+ if (SDValue SV = tryWhileWRFromOR(Op, DAG))
+ return SV;
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
!Subtarget->isNeonAvailable()))
return LowerToScalableOp(Op, DAG);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 0000000000000..3662efa41c151
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,884 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+define dso_local void @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB0_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.b
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB0_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.b, x8, x9
+; CHECK-NEXT: b.mi .LBB0_2
+; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB0_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB0_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB0_2
+; CHECK-NOSVE2-NEXT: .LBB0_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b15, %c14
+ %neg.compare = icmp slt i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+ %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
+ %2 = zext i8 %1 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
+ %4 = getelementptr inbounds i8, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %5 = getelementptr inbounds i8, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
+ %7 = getelementptr inbounds i8, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
+ %index.next = add i64 %index, %2
+ %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB1_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.h, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB1_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
+; CHECK-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
+; CHECK-NEXT: inch x9
+; CHECK-NEXT: whilelo p0.h, x9, x8
+; CHECK-NEXT: b.mi .LBB1_2
+; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB1_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63
+; CHECK-NOSVE2-NEXT: cset w11, lt
+; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1
+; CHECK-NOSVE2-NEXT: asr x10, x10, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
+; CHECK-NOSVE2-NEXT: cnth x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB1_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB1_2
+; CHECK-NOSVE2-NEXT: .LBB1_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 3
+ %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b14, %c15
+ %diff = sdiv i64 %sub.diff, 2
+ %neg.compare = icmp slt i64 %sub.diff, -1
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i16, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %4 = getelementptr inbounds i16, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
+ %6 = getelementptr inbounds i16, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1
+ %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB2_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.s, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB2_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
+; CHECK-NEXT: incw x9
+; CHECK-NEXT: whilelo p0.s, x9, x8
+; CHECK-NEXT: b.mi .LBB2_2
+; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB2_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: add x11, x10, #3
+; CHECK-NOSVE2-NEXT: cmp x10, #0
+; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
+; CHECK-NOSVE2-NEXT: cmn x10, #3
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x11, x11, #2
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
+; CHECK-NOSVE2-NEXT: cntw x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB2_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB2_2
+; CHECK-NOSVE2-NEXT: .LBB2_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b12, %c13
+ %diff = sdiv i64 %sub.diff, 4
+ %neg.compare = icmp slt i64 %sub.diff, -3
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+ %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %4 = getelementptr inbounds i32, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i32, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1
+ %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB3_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.d, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB3_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
+; CHECK-NEXT: incd x9
+; CHECK-NEXT: whilelo p0.d, x9, x8
+; CHECK-NEXT: b.mi .LBB3_2
+; CHECK-NEXT: .LBB3_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB3_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: add x11, x10, #7
+; CHECK-NOSVE2-NEXT: cmp x10, #0
+; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
+; CHECK-NOSVE2-NEXT: cmn x10, #7
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x11, x11, #3
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
+; CHECK-NOSVE2-NEXT: cntd x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB3_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB3_2
+; CHECK-NOSVE2-NEXT: .LBB3_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 1
+ %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b12, %c13
+ %diff = sdiv i64 %sub.diff, 8
+ %neg.compare = icmp slt i64 %sub.diff, -7
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+ %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i64, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+ %4 = getelementptr inbounds i64, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+ %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i64, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1
+ %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_multiple_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB4_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.b, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.b, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.b
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB4_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.b, x8, x9
+; CHECK-NEXT: b.mi .LBB4_2
+; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_multiple_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB4_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB4_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB4_2
+; CHECK-NOSVE2-NEXT: .LBB4_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %a15 = ptrtoint ptr %a to i64
+ %b16 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %a15, %c14
+ %neg.compare = icmp slt i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+ %sub.diff18 = sub i64 %b16, %c14
+ %neg.compare20 = icmp slt i64 %sub.diff18, 0
+ %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
+ %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
+ %active.lane.mask.alias24 = or <vscale x 16 x i1> %ptr.diff.lane.mask23, %.splat22
+ %0 = and <vscale x 16 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
+ %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+ %1 = zext <vscale x 16 x i1> %0 to <vscale x 16 x i8>
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %1)
+ %3 = zext i8 %2 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 16 x i1> %active.lane.mask, %0
+ %5 = getelementptr inbounds i8, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
+ %6 = getelementptr inbounds i8, ptr %b, i64 %index
+ %wide.masked.load25 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
+ %7 = add <vscale x 16 x i8> %wide.masked.load25, %wide.masked.load
+ %8 = getelementptr inbounds i8, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %7, ptr %8, i32 1, <vscale x 16 x i1> %4)
+ %index.next = add i64 %index, %3
+ %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_multiple_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB5_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.h, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.h, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.h, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.h
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB5_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.h, x8, x9
+; CHECK-NEXT: b.mi .LBB5_2
+; CHECK-NEXT: .LBB5_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_multiple_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB5_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmn x9, #1
+; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: asr x9, x9, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
+; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63
+; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #1
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB5_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB5_2
+; CHECK-NOSVE2-NEXT: .LBB5_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %a15 = ptrtoint ptr %a to i64
+ %b16 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %a15, %c14
+ %diff = sdiv i64 %sub.diff, 2
+ %neg.compare = icmp slt i64 %sub.diff, -1
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ %sub.diff18 = sub i64 %b16, %c14
+ %diff19 = sdiv i64 %sub.diff18, 2
+ %neg.compare20 = icmp slt i64 %sub.diff18, -1
+ %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
+ %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
+ %active.lane.mask.alias24 = or <vscale x 8 x i1> %ptr.diff.lane.mask23, %.splat22
+ %0 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
+ %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+ %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8>
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1)
+ %3 = zext i8 %2 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 8 x i1> %active.lane.mask, %0
+ %5 = getelementptr inbounds i16, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
+ %6 = getelementptr inbounds i16, ptr %b, i64 %index
+ %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
+ %7 = add <vscale x 8 x i16> %wide.masked.load25, %wide.masked.load
+ %8 = getelementptr inbounds i16, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
+ %index.next = add i64 %index, %3
+ %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_multiple_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB6_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.s, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.s, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.s
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB6_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.s, x8, x9
+; CHECK-NEXT: b.mi .LBB6_2
+; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_multiple_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB6_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: add x10, x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #3
+; CHECK-NOSVE2-NEXT: asr x9, x10, #2
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
+; CHECK-NOSVE2-NEXT: add x10, x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #3
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: asr x10, x10, #2
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10
+; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB6_2
+; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0 ; the loop is only entered when %n > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c12 = ptrtoint ptr %c to i64
+ %a13 = ptrtoint ptr %a to i64
+ %b14 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %a13, %c12 ; byte distance %a - %c
+ %diff = sdiv i64 %sub.diff, 4 ; same distance in units of 4 bytes (the i32 size used by the accesses below)
+ %neg.compare = icmp slt i64 %sub.diff, -3 ; compares the byte distance against -(4 - 1)
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) ; lane i is active while i < %diff (unsigned)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat ; alias mask for %a vs %c; the SVE2 run expects "whilewr p0.s, x0, x2"
+ %sub.diff16 = sub i64 %b14, %c12 ; byte distance %b - %c
+ %diff17 = sdiv i64 %sub.diff16, 4
+ %neg.compare18 = icmp slt i64 %sub.diff16, -3
+ %.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
+ %.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17)
+ %active.lane.mask.alias22 = or <vscale x 4 x i1> %ptr.diff.lane.mask21, %.splat20 ; alias mask for %b vs %c; the SVE2 run expects "whilewr p1.s, x1, x2"
+ %0 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.alias22 ; lanes enabled in both alias masks
+ %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) ; trip-count mask for the first iteration
+ %1 = zext <vscale x 4 x i1> %0 to <vscale x 4 x i8>
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %1) ; number of set lanes in %0, computed in i8
+ %3 = zext i8 %2 to i64 ; used as the induction step in %vector.body
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 4 x i1> %active.lane.mask, %0 ; trip-count mask restricted to the combined alias mask
+ %5 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
+ %6 = getelementptr inbounds i32, ptr %b, i64 %index
+ %wide.masked.load23 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
+ %7 = add <vscale x 4 x i32> %wide.masked.load23, %wide.masked.load
+ %8 = getelementptr inbounds i32, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %7, ptr %8, i32 4, <vscale x 4 x i1> %4) ; c[index..] = a[index..] + b[index..] on the lanes in %4
+ %index.next = add i64 %index, %3 ; advance by the set-lane count of the combined alias mask
+ %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0 ; keep looping while lane 0 of the next trip-count mask is set
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_multiple_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB7_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.d, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.d, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.d
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB7_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.d, x8, x9
+; CHECK-NEXT: b.mi .LBB7_2
+; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_multiple_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB7_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: add x10, x9, #7
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #7
+; CHECK-NOSVE2-NEXT: asr x9, x10, #3
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
+; CHECK-NOSVE2-NEXT: add x10, x9, #7
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #7
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: asr x10, x10, #3
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10
+; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB7_2
+; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0 ; the loop is only entered when %n > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c12 = ptrtoint ptr %c to i64
+ %a13 = ptrtoint ptr %a to i64
+ %b14 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %a13, %c12 ; byte distance %a - %c
+ %diff = sdiv i64 %sub.diff, 8 ; same distance in units of 8 bytes (the i64 size used by the accesses below)
+ %neg.compare = icmp slt i64 %sub.diff, -7 ; compares the byte distance against -(8 - 1)
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) ; lane i is active while i < %diff (unsigned)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat ; alias mask for %a vs %c; the SVE2 run expects "whilewr p0.d, x0, x2"
+ %sub.diff16 = sub i64 %b14, %c12 ; byte distance %b - %c
+ %diff17 = sdiv i64 %sub.diff16, 8
+ %neg.compare18 = icmp slt i64 %sub.diff16, -7
+ %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
+ %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)
+ %active.lane.mask.alias22 = or <vscale x 2 x i1> %ptr.diff.lane.mask21, %.splat20 ; alias mask for %b vs %c; the SVE2 run expects "whilewr p1.d, x1, x2"
+ %0 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.alias22 ; lanes enabled in both alias masks
+ %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) ; trip-count mask for the first iteration
+ %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8>
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1) ; number of set lanes in %0, computed in i8
+ %3 = zext i8 %2 to i64 ; used as the induction step in %vector.body
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 2 x i1> %active.lane.mask, %0 ; trip-count mask restricted to the combined alias mask
+ %5 = getelementptr inbounds i64, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+ %6 = getelementptr inbounds i64, ptr %b, i64 %index
+ %wide.masked.load23 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+ %7 = add <vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
+ %8 = getelementptr inbounds i64, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4) ; c[index..] = a[index..] + b[index..] on the lanes in %4
+ %index.next = add i64 %index, %3 ; advance by the set-lane count of the combined alias mask
+ %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0 ; keep looping while lane 0 of the next trip-count mask is set
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64() #1
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
+
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
+
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
+
+declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
+
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
+
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
+
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
+
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
+
+declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
+
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+outline-atomics,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
>From 54129dcb824821e8575e2b9e8005782259439dc4 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 17:12:20 +0100
Subject: [PATCH 02/12] Add codegen test
---
llvm/test/CodeGen/AArch64/whilewr.ll | 127 +++++++++++++++++++++++++++
1 file changed, 127 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/whilewr.ll
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
new file mode 100644
index 0000000000000..84855e3898360
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+define dso_local <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %sub.diff = sub i64 %b15, %c14 ; byte distance %b - %c (only %b and %c are used in this function)
+ %neg.compare = icmp slt i64 %sub.diff, 0 ; true when the distance is negative
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) ; the raw distance is passed directly, with no sdiv
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat ; lane mask first, splat second; the SVE2 run expects a single "whilewr p0.b, x1, x2"
+ ret <vscale x 16 x i1> %active.lane.mask.alias
+}
+
+define dso_local <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.h, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmn x8, #1
+; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15 ; byte distance %b - %c (only %b and %c are used in this function)
+ %diff = sdiv i64 %sub.diff, 2 ; signed division of the byte distance by 2
+ %neg.compare = icmp slt i64 %sub.diff, -1 ; compares the byte distance against -(2 - 1)
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) ; lane i is active while i < %diff (unsigned)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat ; the SVE2 run expects a single "whilewr p0.h, x1, x2"
+ ret <vscale x 8 x i1> %active.lane.mask.alias
+}
+
+define dso_local <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.s, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: add x9, x8, #3
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #3
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #2
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b12, %c13 ; byte distance %b - %c (only %b and %c are used in this function)
+ %diff = sdiv i64 %sub.diff, 4 ; signed division of the byte distance by 4
+ %neg.compare = icmp slt i64 %sub.diff, -3 ; compares the byte distance against -(4 - 1)
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) ; lane i is active while i < %diff (unsigned)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat ; the SVE2 run expects a single "whilewr p0.s, x1, x2"
+ ret <vscale x 4 x i1> %active.lane.mask.alias
+}
+
+define dso_local <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.d, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: add x9, x8, #7
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #7
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #3
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b12, %c13 ; byte distance %b - %c (only %b and %c are used in this function)
+ %diff = sdiv i64 %sub.diff, 8 ; signed division of the byte distance by 8
+ %neg.compare = icmp slt i64 %sub.diff, -7 ; compares the byte distance against -(8 - 1)
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) ; lane i is active while i < %diff (unsigned)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat ; the SVE2 run expects a single "whilewr p0.d, x1, x2"
+ ret <vscale x 2 x i1> %active.lane.mask.alias
+}
>From a667dcde8ff60f3d1fb8dc47e9ebdc2f3fea8f82 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 17:13:01 +0100
Subject: [PATCH 03/12] format
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c2e9ba6291855..2a4daf779f1e1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13785,9 +13785,9 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
}
/// Try to lower the construction of a pointer alias mask to a WHILEWR.
-/// The mask's enabled lanes represent the elements that will not overlap across one loop iteration.
-/// This tries to match:
-/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+/// The mask's enabled lanes represent the elements that will not overlap across
+/// one loop iteration. This tries to match: or (splat (setcc_lt (sub ptrA,
+/// ptrB), -(element_size - 1))),
/// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
@@ -13825,7 +13825,8 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
(EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
return SDValue();
- // An alias mask for i8 elements omits the division because it would just divide by 1
+ // An alias mask for i8 elements omits the division because it would just
+ // divide by 1
if (EltSize > 1) {
auto DiffDiv = LaneMask.getOperand(2);
auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
>From 9899a41fb85c918b81d8b931527cf61ea5292fdf Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 13:48:49 +0100
Subject: [PATCH 04/12] Use Log2_64
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2a4daf779f1e1..b1faf021de95d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -94,7 +94,6 @@
#include <bitset>
#include <cassert>
#include <cctype>
-#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iterator>
@@ -13830,7 +13829,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
if (EltSize > 1) {
auto DiffDiv = LaneMask.getOperand(2);
auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
- if (!DiffDivConst || DiffDivConst->getZExtValue() != std::log2(EltSize))
+ if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
return SDValue();
} else if (LaneMask.getOperand(2) != Diff)
return SDValue();
>From 5f739bdccfd6fc2577959678855c194fb9e8242f Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 13:49:49 +0100
Subject: [PATCH 05/12] Fix comment formatting
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1faf021de95d..08776f26432d6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13785,8 +13785,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
/// Try to lower the construction of a pointer alias mask to a WHILEWR.
/// The mask's enabled lanes represent the elements that will not overlap across
-/// one loop iteration. This tries to match: or (splat (setcc_lt (sub ptrA,
-/// ptrB), -(element_size - 1))),
+/// one loop iteration. This tries to match:
+/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
/// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
>From e69f3be14a100f54032de69dd3dce615072dc6f2 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 13:51:11 +0100
Subject: [PATCH 06/12] Use SDValue instead of auto
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 08776f26432d6..dd9bc8151dacb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13791,15 +13791,15 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
return SDValue();
- auto LaneMask = Op.getOperand(0);
- auto Splat = Op.getOperand(1);
+ SDValue LaneMask = Op.getOperand(0);
+ SDValue Splat = Op.getOperand(1);
if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
Splat.getOpcode() != ISD::SPLAT_VECTOR)
return SDValue();
- auto Cmp = Splat.getOperand(0);
+ SDValue Cmp = Splat.getOperand(0);
if (Cmp.getOpcode() != ISD::SETCC)
return SDValue();
@@ -13815,7 +13815,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
if (!isPowerOf2_64(EltSize) || EltSize > 64)
return SDValue();
- auto Diff = Cmp.getOperand(0);
+ SDValue Diff = Cmp.getOperand(0);
if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
return SDValue();
@@ -13827,15 +13827,15 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
// An alias mask for i8 elements omits the division because it would just
// divide by 1
if (EltSize > 1) {
- auto DiffDiv = LaneMask.getOperand(2);
+ SDValue DiffDiv = LaneMask.getOperand(2);
auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
return SDValue();
} else if (LaneMask.getOperand(2) != Diff)
return SDValue();
- auto StorePtr = Diff.getOperand(0);
- auto ReadPtr = Diff.getOperand(1);
+ SDValue StorePtr = Diff.getOperand(0);
+ SDValue ReadPtr = Diff.getOperand(1);
unsigned IntrinsicID = 0;
switch (EltSize) {
@@ -13856,9 +13856,8 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
}
SDLoc DL(Op);
SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
- auto N = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
StorePtr, ReadPtr);
- return N;
}
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
>From 653d6d2d13b9b2f593199aa1086d6f19c667b4e0 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 13:55:15 +0100
Subject: [PATCH 07/12] Check for OR operands being the other way around
---
.../Target/AArch64/AArch64ISelLowering.cpp | 3 ++
llvm/test/CodeGen/AArch64/whilewr.ll | 28 +++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dd9bc8151dacb..a321a622b4ae4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13794,6 +13794,9 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
SDValue LaneMask = Op.getOperand(0);
SDValue Splat = Op.getOperand(1);
+ if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+ std::swap(LaneMask, Splat);
+
if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
Splat.getOpcode() != ISD::SPLAT_VECTOR)
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 84855e3898360..5b9a9775e6597 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -29,6 +29,34 @@ entry:
ret <vscale x 16 x i1> %active.lane.mask.alias
}
+define dso_local <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_commutative:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_commutative:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %sub.diff = sub i64 %b15, %c14 ; byte distance %b - %c (only %b and %c are used in this function)
+ %neg.compare = icmp slt i64 %sub.diff, 0 ; true when the distance is negative
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer ; %neg.compare broadcast to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) ; the raw distance is passed directly, with no sdiv
+ %active.lane.mask.alias = or <vscale x 16 x i1> %.splat, %ptr.diff.lane.mask ; splat is the first operand here and the lane mask the second; the SVE2 run still expects a single whilewr
+ ret <vscale x 16 x i1> %active.lane.mask.alias
+}
+
define dso_local <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_16:
; CHECK: // %bb.0: // %entry
>From 487ff3ca120420fd184dbb7e980332aaab302389 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 13:55:42 +0100
Subject: [PATCH 08/12] Replace dyn_cast and assert by cast
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a321a622b4ae4..42d529e9dfdbe 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13806,8 +13806,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
if (Cmp.getOpcode() != ISD::SETCC)
return SDValue();
- CondCodeSDNode *Cond = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2));
- assert(Cond && "SETCC doesn't have a condition code");
+ CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
>From ab22dd1b375628abdd94d7de6a05a1b970a3ad6b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 14:29:35 +0100
Subject: [PATCH 09/12] Fix eltsize comparison and add test for it
---
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
llvm/test/CodeGen/AArch64/whilewr.ll | 53 +++++++++++++++++++
2 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 42d529e9dfdbe..dedbea0c5d0b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13814,7 +13814,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
return SDValue();
unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
unsigned EltSize = CompValue + 1;
- if (!isPowerOf2_64(EltSize) || EltSize > 64)
+ if (!isPowerOf2_64(EltSize) || EltSize > 8)
return SDValue();
SDValue Diff = Cmp.getOperand(0);
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 5b9a9775e6597..93ed825cf60f3 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -153,3 +153,56 @@ entry:
%active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
ret <vscale x 2 x i1> %active.lane.mask.alias
}
+
+define dso_local <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilewr_128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub x8, x1, x2
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x9, x8, #15
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: csel x9, x9, x8, lt
+; CHECK-NEXT: cmn x8, #15
+; CHECK-NEXT: asr x9, x9, #4
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilewr_128:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
+; CHECK-NOSVE2-NEXT: ptrue p0.d
+; CHECK-NOSVE2-NEXT: add x9, x8, #15
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #15
+; CHECK-NOSVE2-NEXT: asr x9, x9, #4
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: mov z1.d, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
+; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry: ; Negative test: alias-mask pattern built for a 16-byte element; the expected output of both runs contains no whilewr.
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b12, %c13 ; signed byte distance %b - %c
+ %diff = sdiv i64 %sub.diff, 16 ; distance in 16-byte units
+ %neg.compare = icmp slt i64 %sub.diff, -15 ; bound -15 gives an element size of 16 (abs + 1), above the 8-byte limit this patch adds to tryWhileWRFromOR
+ %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer ; broadcast of the comparison result to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff) ; lane mask bounded by the scaled distance
+ %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 1 x i1> %active.lane.mask.alias
+}
>From 0c378a360ce3838cbb5e5a006a21eda7274f6872 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 16:47:23 +0100
Subject: [PATCH 10/12] Remove O3 from test
---
llvm/test/CodeGen/AArch64/whilewr.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 93ed825cf60f3..bb8c43a20cb71 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
define dso_local <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_8:
; CHECK: // %bb.0: // %entry
>From f114dc7413173a12cd65520cf1f50a89058b6052 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 16:47:56 +0100
Subject: [PATCH 11/12] Remove dso_local
---
llvm/test/CodeGen/AArch64/whilewr.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index bb8c43a20cb71..18a1dcf0078b7 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
-define dso_local <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: whilewr p0.b, x1, x2
@@ -29,7 +29,7 @@ entry:
ret <vscale x 16 x i1> %active.lane.mask.alias
}
-define dso_local <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_commutative:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: whilewr p0.b, x1, x2
@@ -57,7 +57,7 @@ entry:
ret <vscale x 16 x i1> %active.lane.mask.alias
}
-define dso_local <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: whilewr p0.h, x1, x2
@@ -88,7 +88,7 @@ entry:
ret <vscale x 8 x i1> %active.lane.mask.alias
}
-define dso_local <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: whilewr p0.s, x1, x2
@@ -121,7 +121,7 @@ entry:
ret <vscale x 4 x i1> %active.lane.mask.alias
}
-define dso_local <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: whilewr p0.d, x1, x2
@@ -154,7 +154,7 @@ entry:
ret <vscale x 2 x i1> %active.lane.mask.alias
}
-define dso_local <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: no_whilewr_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub x8, x1, x2
>From bbcb9359ff1dbbde365ca22cd05a5baa3d2ced24 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 30 Jul 2024 16:53:06 +0100
Subject: [PATCH 12/12] Move loop tests to whilewr.ll
---
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
llvm/test/CodeGen/AArch64/whilewr.ll | 877 +++++++++++++++++
.../LoopVectorize/AArch64/alias_mask.ll | 884 ------------------
3 files changed, 878 insertions(+), 885 deletions(-)
delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dedbea0c5d0b0..7c35f46fb08e4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13859,7 +13859,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
- StorePtr, ReadPtr);
+ StorePtr, ReadPtr);
}
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 18a1dcf0078b7..67959112705a1 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -206,3 +206,880 @@ entry:
%active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
ret <vscale x 1 x i1> %active.lane.mask.alias
}
+
+define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_loop_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB6_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.b
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB6_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.b, x8, x9
+; CHECK-NEXT: b.mi .LBB6_2
+; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB6_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB6_2
+; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry: ; Predicated i8 loop storing a[i] + b[i] to c[i]; the %b/%c alias mask is expected to become one whilewr in the sve2 run.
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b15, %c14 ; signed byte distance from the stored-to pointer %c to the loaded pointer %b
+ %neg.compare = icmp slt i64 %sub.diff, 0 ; negative distance: every lane is enabled through the splat below
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) ; lane mask bounded by the distance
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat ; alias mask = lane mask OR splat(negative distance)
+ %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0) ; i8 sum of the zero-extended alias-mask lanes
+ %2 = zext i8 %1 to i64 ; used as the induction step below
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias ; trip-count mask combined with the alias mask on every iteration
+ %4 = getelementptr inbounds i8, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %5 = getelementptr inbounds i8, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
+ %7 = getelementptr inbounds i8, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
+ %index.next = add i64 %index, %2 ; advance by the alias-mask lane count
+ %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_loop_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB7_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.h, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB7_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
+; CHECK-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
+; CHECK-NEXT: inch x9
+; CHECK-NEXT: whilelo p0.h, x9, x8
+; CHECK-NEXT: b.mi .LBB7_2
+; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB7_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63
+; CHECK-NOSVE2-NEXT: cset w11, lt
+; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1
+; CHECK-NOSVE2-NEXT: asr x10, x10, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
+; CHECK-NOSVE2-NEXT: cnth x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB7_2
+; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry: ; Predicated i16 loop storing a[i] + b[i] to c[i]; the %b/%c alias mask is expected to become one whilewr in the sve2 run.
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 3 ; vscale * 8, used as the induction step below
+ %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b14, %c15 ; signed byte distance from the stored-to pointer %c to the loaded pointer %b
+ %diff = sdiv i64 %sub.diff, 2 ; distance in 2-byte elements
+ %neg.compare = icmp slt i64 %sub.diff, -1 ; bound is -(element size - 1)
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) ; lane mask bounded by the element distance
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat ; alias mask = lane mask OR splat(comparison)
+ %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry ; alias mask is folded into the entry mask once, before the loop
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i16, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %4 = getelementptr inbounds i16, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
+ %6 = getelementptr inbounds i16, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1 ; advance by vscale * 8 elements
+ %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_loop_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB8_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.s, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB8_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
+; CHECK-NEXT: incw x9
+; CHECK-NEXT: whilelo p0.s, x9, x8
+; CHECK-NEXT: b.mi .LBB8_2
+; CHECK-NEXT: .LBB8_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB8_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: add x11, x10, #3
+; CHECK-NOSVE2-NEXT: cmp x10, #0
+; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
+; CHECK-NOSVE2-NEXT: cmn x10, #3
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x11, x11, #2
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
+; CHECK-NOSVE2-NEXT: cntw x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB8_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB8_2
+; CHECK-NOSVE2-NEXT: .LBB8_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry: ; Predicated i32 loop storing a[i] + b[i] to c[i]; the %b/%c alias mask is expected to become one whilewr in the sve2 run.
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2 ; vscale * 4, used as the induction step below
+ %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b12, %c13 ; signed byte distance from the stored-to pointer %c to the loaded pointer %b
+ %diff = sdiv i64 %sub.diff, 4 ; distance in 4-byte elements
+ %neg.compare = icmp slt i64 %sub.diff, -3 ; bound is -(element size - 1)
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) ; lane mask bounded by the element distance
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat ; alias mask = lane mask OR splat(comparison)
+ %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry ; alias mask is folded into the entry mask once, before the loop
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %4 = getelementptr inbounds i32, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i32, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1 ; advance by vscale * 4 elements
+ %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_loop_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB9_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.d, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB9_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
+; CHECK-NEXT: incd x9
+; CHECK-NEXT: whilelo p0.d, x9, x8
+; CHECK-NEXT: b.mi .LBB9_2
+; CHECK-NEXT: .LBB9_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB9_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: add x11, x10, #7
+; CHECK-NOSVE2-NEXT: cmp x10, #0
+; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
+; CHECK-NOSVE2-NEXT: cmn x10, #7
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x11, x11, #3
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
+; CHECK-NOSVE2-NEXT: cntd x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB9_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB9_2
+; CHECK-NOSVE2-NEXT: .LBB9_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry: ; Predicated i64 loop storing a[i] + b[i] to c[i]; the %b/%c alias mask is expected to become one whilewr in the sve2 run.
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 1 ; vscale * 2, used as the induction step below
+ %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b12, %c13 ; signed byte distance from the stored-to pointer %c to the loaded pointer %b
+ %diff = sdiv i64 %sub.diff, 8 ; distance in 8-byte elements
+ %neg.compare = icmp slt i64 %sub.diff, -7 ; bound is -(element size - 1)
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) ; lane mask bounded by the element distance
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat ; alias mask = lane mask OR splat(comparison)
+ %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry ; alias mask is folded into the entry mask once, before the loop
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i64, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+ %4 = getelementptr inbounds i64, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+ %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i64, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1 ; advance by vscale * 2 elements
+ %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_loop_multiple_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB10_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.b, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.b, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.b
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB10_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.b, x8, x9
+; CHECK-NEXT: b.mi .LBB10_2
+; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB10_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB10_2
+; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry: ; Predicated i8 loop where %a is not noalias: alias masks for %a/%c and %b/%c are both built, and the sve2 run is expected to show two whilewr.
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %a15 = ptrtoint ptr %a to i64
+ %b16 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %a15, %c14 ; signed byte distance from the stored-to pointer %c to the loaded pointer %a
+ %neg.compare = icmp slt i64 %sub.diff, 0 ; negative distance: every lane is enabled through the splat below
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat ; first alias mask (%a against %c)
+ %sub.diff18 = sub i64 %b16, %c14 ; signed byte distance from the stored-to pointer %c to the loaded pointer %b
+ %neg.compare20 = icmp slt i64 %sub.diff18, 0
+ %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
+ %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
+ %active.lane.mask.alias24 = or <vscale x 16 x i1> %ptr.diff.lane.mask23, %.splat22 ; second alias mask (%b against %c)
+ %0 = and <vscale x 16 x i1> %active.lane.mask.alias, %active.lane.mask.alias24 ; a lane is usable only when both alias masks enable it
+ %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+ %1 = zext <vscale x 16 x i1> %0 to <vscale x 16 x i8>
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %1) ; i8 sum of the zero-extended combined-mask lanes
+ %3 = zext i8 %2 to i64 ; used as the induction step below
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 16 x i1> %active.lane.mask, %0 ; trip-count mask combined with the alias masks on every iteration
+ %5 = getelementptr inbounds i8, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
+ %6 = getelementptr inbounds i8, ptr %b, i64 %index
+ %wide.masked.load25 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
+ %7 = add <vscale x 16 x i8> %wide.masked.load25, %wide.masked.load
+ %8 = getelementptr inbounds i8, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %7, ptr %8, i32 1, <vscale x 16 x i1> %4)
+ %index.next = add i64 %index, %3 ; advance by the combined-mask lane count
+ %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; c[i] = a[i] + b[i] on i16 data, predicated by alias masks for (a, c) and (b, c)
+; CHECK-LABEL: whilewr_loop_multiple_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB11_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.h, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.h, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.h, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.h
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB11_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.h, x8, x9
+; CHECK-NEXT: b.mi .LBB11_2
+; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB11_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmn x9, #1
+; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: asr x9, x9, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
+; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63
+; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #1
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB11_2
+; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0 ; the loop only runs when n > 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %a15 = ptrtoint ptr %a to i64
+ %b16 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64 ; trip count widened to i64
+ %sub.diff = sub i64 %a15, %c14 ; byte distance a - c
+ %diff = sdiv i64 %sub.diff, 2 ; same distance in i16 elements (2 bytes each)
+ %neg.compare = icmp slt i64 %sub.diff, -1 ; true when a - c <= -2 bytes
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer ; broadcast the flag to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) ; lanes whose index is below %diff
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat ; alias mask for (a, c); the first set of expectations above shows it as one whilewr
+ %sub.diff18 = sub i64 %b16, %c14 ; byte distance b - c
+ %diff19 = sdiv i64 %sub.diff18, 2 ; same distance in i16 elements
+ %neg.compare20 = icmp slt i64 %sub.diff18, -1 ; true when b - c <= -2 bytes
+ %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
+ %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer ; broadcast the flag to every lane
+ %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19) ; lanes whose index is below %diff19
+ %active.lane.mask.alias24 = or <vscale x 8 x i1> %ptr.diff.lane.mask23, %.splat22 ; alias mask for (b, c)
+ %0 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.alias24 ; lanes enabled by both alias masks
+ %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) ; predicate for the first iteration
+ %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8> ; widen each mask bit to 0 or 1 so the lanes can be summed
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1) ; number of enabled lanes in the combined alias mask
+ %3 = zext i8 %2 to i64 ; that count becomes the per-iteration index step
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 8 x i1> %active.lane.mask, %0 ; restrict the trip-count predicate to the alias-mask lanes
+ %5 = getelementptr inbounds i16, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
+ %6 = getelementptr inbounds i16, ptr %b, i64 %index
+ %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
+ %7 = add <vscale x 8 x i16> %wide.masked.load25, %wide.masked.load
+ %8 = getelementptr inbounds i16, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
+ %index.next = add i64 %index, %3 ; advance by the alias-mask lane count rather than a vscale-derived constant
+ %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0 ; lane 0 of the next predicate decides whether to loop again
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; c[i] = a[i] + b[i] on i32 data, predicated by alias masks for (a, c) and (b, c)
+; CHECK-LABEL: whilewr_loop_multiple_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB12_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.s, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.s, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.s
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB12_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.s, x8, x9
+; CHECK-NEXT: b.mi .LBB12_2
+; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB12_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: add x10, x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #3
+; CHECK-NOSVE2-NEXT: asr x9, x10, #2
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
+; CHECK-NOSVE2-NEXT: add x10, x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #3
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: asr x10, x10, #2
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10
+; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB12_2
+; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0 ; the loop only runs when n > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c12 = ptrtoint ptr %c to i64
+ %a13 = ptrtoint ptr %a to i64
+ %b14 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64 ; trip count widened to i64
+ %sub.diff = sub i64 %a13, %c12 ; byte distance a - c
+ %diff = sdiv i64 %sub.diff, 4 ; same distance in i32 elements (4 bytes each)
+ %neg.compare = icmp slt i64 %sub.diff, -3 ; true when a - c <= -4 bytes
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer ; broadcast the flag to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) ; lanes whose index is below %diff
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat ; alias mask for (a, c); the first set of expectations above shows it as one whilewr
+ %sub.diff16 = sub i64 %b14, %c12 ; byte distance b - c
+ %diff17 = sdiv i64 %sub.diff16, 4 ; same distance in i32 elements
+ %neg.compare18 = icmp slt i64 %sub.diff16, -3 ; true when b - c <= -4 bytes
+ %.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
+ %.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer ; broadcast the flag to every lane
+ %ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17) ; lanes whose index is below %diff17
+ %active.lane.mask.alias22 = or <vscale x 4 x i1> %ptr.diff.lane.mask21, %.splat20 ; alias mask for (b, c)
+ %0 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.alias22 ; lanes enabled by both alias masks
+ %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) ; predicate for the first iteration
+ %1 = zext <vscale x 4 x i1> %0 to <vscale x 4 x i8> ; widen each mask bit to 0 or 1 so the lanes can be summed
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %1) ; number of enabled lanes in the combined alias mask
+ %3 = zext i8 %2 to i64 ; that count becomes the per-iteration index step
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 4 x i1> %active.lane.mask, %0 ; restrict the trip-count predicate to the alias-mask lanes
+ %5 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
+ %6 = getelementptr inbounds i32, ptr %b, i64 %index
+ %wide.masked.load23 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
+ %7 = add <vscale x 4 x i32> %wide.masked.load23, %wide.masked.load
+ %8 = getelementptr inbounds i32, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %7, ptr %8, i32 4, <vscale x 4 x i1> %4)
+ %index.next = add i64 %index, %3 ; advance by the alias-mask lane count rather than a vscale-derived constant
+ %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0 ; lane 0 of the next predicate decides whether to loop again
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; c[i] = a[i] + b[i] on i64 data, predicated by alias masks for (a, c) and (b, c)
+; CHECK-LABEL: whilewr_loop_multiple_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB13_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.d, x0, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilewr p1.d, x1, x2
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.d
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB13_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.d, x8, x9
+; CHECK-NEXT: b.mi .LBB13_2
+; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB13_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x0, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: add x10, x9, #7
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #7
+; CHECK-NOSVE2-NEXT: asr x9, x10, #3
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
+; CHECK-NOSVE2-NEXT: add x10, x9, #7
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmn x9, #7
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: asr x10, x10, #3
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10
+; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB13_2
+; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0 ; the loop only runs when n > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c12 = ptrtoint ptr %c to i64
+ %a13 = ptrtoint ptr %a to i64
+ %b14 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64 ; trip count widened to i64
+ %sub.diff = sub i64 %a13, %c12 ; byte distance a - c
+ %diff = sdiv i64 %sub.diff, 8 ; same distance in i64 elements (8 bytes each)
+ %neg.compare = icmp slt i64 %sub.diff, -7 ; true when a - c <= -8 bytes
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer ; broadcast the flag to every lane
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) ; lanes whose index is below %diff
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat ; alias mask for (a, c); the first set of expectations above shows it as one whilewr
+ %sub.diff16 = sub i64 %b14, %c12 ; byte distance b - c
+ %diff17 = sdiv i64 %sub.diff16, 8 ; same distance in i64 elements
+ %neg.compare18 = icmp slt i64 %sub.diff16, -7 ; true when b - c <= -8 bytes
+ %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
+ %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer ; broadcast the flag to every lane
+ %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17) ; lanes whose index is below %diff17
+ %active.lane.mask.alias22 = or <vscale x 2 x i1> %ptr.diff.lane.mask21, %.splat20 ; alias mask for (b, c)
+ %0 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.alias22 ; lanes enabled by both alias masks
+ %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) ; predicate for the first iteration
+ %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8> ; widen each mask bit to 0 or 1 so the lanes can be summed
+ %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1) ; number of enabled lanes in the combined alias mask
+ %3 = zext i8 %2 to i64 ; that count becomes the per-iteration index step
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %4 = and <vscale x 2 x i1> %active.lane.mask, %0 ; restrict the trip-count predicate to the alias-mask lanes
+ %5 = getelementptr inbounds i64, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+ %6 = getelementptr inbounds i64, ptr %b, i64 %index
+ %wide.masked.load23 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+ %7 = add <vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
+ %8 = getelementptr inbounds i64, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
+ %index.next = add i64 %index, %3 ; advance by the alias-mask lane count rather than a vscale-derived constant
+ %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0 ; lane 0 of the next predicate decides whether to loop again
+ br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
+
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>)
+
+declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>)
+
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
+
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>)
+
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
+
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>)
+
+declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
deleted file mode 100644
index 3662efa41c151..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ /dev/null
@@ -1,884 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
-define dso_local void @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB0_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB0_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB0_2
-; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB0_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB0_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB0_2
-; CHECK-NOSVE2-NEXT: .LBB0_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp11 = icmp sgt i32 %n, 0
- br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %c14 = ptrtoint ptr %c to i64
- %b15 = ptrtoint ptr %b to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
- %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
- %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
- %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
- %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
- %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
- %2 = zext i8 %1 to i64
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
- %4 = getelementptr inbounds i8, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
- %5 = getelementptr inbounds i8, ptr %b, i64 %index
- %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
- %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
- %7 = getelementptr inbounds i8, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
- %index.next = add i64 %index, %2
- %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
- %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
- br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB1_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.h, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB1_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
-; CHECK-NEXT: inch x9
-; CHECK-NEXT: whilelo p0.h, x9, x8
-; CHECK-NEXT: b.mi .LBB1_2
-; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB1_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT: cmn x10, #1
-; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT: cset w11, lt
-; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1
-; CHECK-NOSVE2-NEXT: asr x10, x10, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11
-; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT: cnth x10
-; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB1_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB1_2
-; CHECK-NOSVE2-NEXT: .LBB1_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp11 = icmp sgt i32 %n, 0
- br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %b14 = ptrtoint ptr %b to i64
- %c15 = ptrtoint ptr %c to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 3
- %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
- %sub.diff = sub i64 %b14, %c15
- %diff = sdiv i64 %sub.diff, 2
- %neg.compare = icmp slt i64 %sub.diff, -1
- %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
- %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %3 = getelementptr inbounds i16, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
- %4 = getelementptr inbounds i16, ptr %b, i64 %index
- %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
- %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
- %6 = getelementptr inbounds i16, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
- %index.next = add i64 %index, %1
- %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
- %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
- br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB2_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.s, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.s, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB2_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
-; CHECK-NEXT: incw x9
-; CHECK-NEXT: whilelo p0.s, x9, x8
-; CHECK-NEXT: b.mi .LBB2_2
-; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB2_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT: add x11, x10, #3
-; CHECK-NOSVE2-NEXT: cmp x10, #0
-; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT: cmn x10, #3
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: asr x11, x11, #2
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT: cntw x10
-; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB2_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB2_2
-; CHECK-NOSVE2-NEXT: .LBB2_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp9 = icmp sgt i32 %n, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %b12 = ptrtoint ptr %b to i64
- %c13 = ptrtoint ptr %c to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
- %sub.diff = sub i64 %b12, %c13
- %diff = sdiv i64 %sub.diff, 4
- %neg.compare = icmp slt i64 %sub.diff, -3
- %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
- %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %3 = getelementptr inbounds i32, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
- %4 = getelementptr inbounds i32, ptr %b, i64 %index
- %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
- %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
- %6 = getelementptr inbounds i32, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
- %index.next = add i64 %index, %1
- %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
- %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
- br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB3_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.d, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB3_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
-; CHECK-NEXT: incd x9
-; CHECK-NEXT: whilelo p0.d, x9, x8
-; CHECK-NEXT: b.mi .LBB3_2
-; CHECK-NEXT: .LBB3_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB3_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT: add x11, x10, #7
-; CHECK-NOSVE2-NEXT: cmp x10, #0
-; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT: cmn x10, #7
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: asr x11, x11, #3
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT: cntd x10
-; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB3_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB3_2
-; CHECK-NOSVE2-NEXT: .LBB3_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp9 = icmp sgt i32 %n, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %b12 = ptrtoint ptr %b to i64
- %c13 = ptrtoint ptr %c to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 1
- %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
- %sub.diff = sub i64 %b12, %c13
- %diff = sdiv i64 %sub.diff, 8
- %neg.compare = icmp slt i64 %sub.diff, -7
- %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
- %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %3 = getelementptr inbounds i64, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
- %4 = getelementptr inbounds i64, ptr %b, i64 %index
- %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
- %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
- %6 = getelementptr inbounds i64, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
- %index.next = add i64 %index, %1
- %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
- %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
- br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_multiple_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB4_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.b, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB4_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB4_2
-; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_multiple_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB4_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB4_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB4_2
-; CHECK-NOSVE2-NEXT: .LBB4_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp11 = icmp sgt i32 %n, 0
- br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %c14 = ptrtoint ptr %c to i64
- %a15 = ptrtoint ptr %a to i64
- %b16 = ptrtoint ptr %b to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %sub.diff = sub i64 %a15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
- %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
- %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
- %sub.diff18 = sub i64 %b16, %c14
- %neg.compare20 = icmp slt i64 %sub.diff18, 0
- %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
- %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
- %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
- %active.lane.mask.alias24 = or <vscale x 16 x i1> %ptr.diff.lane.mask23, %.splat22
- %0 = and <vscale x 16 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
- %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
- %1 = zext <vscale x 16 x i1> %0 to <vscale x 16 x i8>
- %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %1)
- %3 = zext i8 %2 to i64
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %4 = and <vscale x 16 x i1> %active.lane.mask, %0
- %5 = getelementptr inbounds i8, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
- %6 = getelementptr inbounds i8, ptr %b, i64 %index
- %wide.masked.load25 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
- %7 = add <vscale x 16 x i8> %wide.masked.load25, %wide.masked.load
- %8 = getelementptr inbounds i8, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %7, ptr %8, i32 1, <vscale x 16 x i1> %4)
- %index.next = add i64 %index, %3
- %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
- %9 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
- br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_multiple_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB5_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.h, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.h, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.h
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB5_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.h, x8, x9
-; CHECK-NEXT: b.mi .LBB5_2
-; CHECK-NEXT: .LBB5_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_multiple_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB5_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmn x9, #1
-; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: asr x9, x9, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT: cmn x10, #1
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #1
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB5_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB5_2
-; CHECK-NOSVE2-NEXT: .LBB5_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp11 = icmp sgt i32 %n, 0
- br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %c14 = ptrtoint ptr %c to i64
- %a15 = ptrtoint ptr %a to i64
- %b16 = ptrtoint ptr %b to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %sub.diff = sub i64 %a15, %c14
- %diff = sdiv i64 %sub.diff, 2
- %neg.compare = icmp slt i64 %sub.diff, -1
- %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
- %sub.diff18 = sub i64 %b16, %c14
- %diff19 = sdiv i64 %sub.diff18, 2
- %neg.compare20 = icmp slt i64 %sub.diff18, -1
- %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
- %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
- %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
- %active.lane.mask.alias24 = or <vscale x 8 x i1> %ptr.diff.lane.mask23, %.splat22
- %0 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
- %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
- %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8>
- %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1)
- %3 = zext i8 %2 to i64
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %4 = and <vscale x 8 x i1> %active.lane.mask, %0
- %5 = getelementptr inbounds i16, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
- %6 = getelementptr inbounds i16, ptr %b, i64 %index
- %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
- %7 = add <vscale x 8 x i16> %wide.masked.load25, %wide.masked.load
- %8 = getelementptr inbounds i16, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
- %index.next = add i64 %index, %3
- %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
- %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
- br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_multiple_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB6_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.s, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.s, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.s
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB6_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.s, x8, x9
-; CHECK-NEXT: b.mi .LBB6_2
-; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_multiple_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB6_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: add x10, x9, #3
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #3
-; CHECK-NOSVE2-NEXT: asr x9, x10, #2
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT: add x10, x9, #3
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #3
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: asr x10, x10, #2
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10
-; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB6_2
-; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp9 = icmp sgt i32 %n, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %c12 = ptrtoint ptr %c to i64
- %a13 = ptrtoint ptr %a to i64
- %b14 = ptrtoint ptr %b to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %sub.diff = sub i64 %a13, %c12
- %diff = sdiv i64 %sub.diff, 4
- %neg.compare = icmp slt i64 %sub.diff, -3
- %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
- %sub.diff16 = sub i64 %b14, %c12
- %diff17 = sdiv i64 %sub.diff16, 4
- %neg.compare18 = icmp slt i64 %sub.diff16, -3
- %.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
- %.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
- %ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17)
- %active.lane.mask.alias22 = or <vscale x 4 x i1> %ptr.diff.lane.mask21, %.splat20
- %0 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
- %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
- %1 = zext <vscale x 4 x i1> %0 to <vscale x 4 x i8>
- %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %1)
- %3 = zext i8 %2 to i64
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %4 = and <vscale x 4 x i1> %active.lane.mask, %0
- %5 = getelementptr inbounds i32, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
- %6 = getelementptr inbounds i32, ptr %b, i64 %index
- %wide.masked.load23 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
- %7 = add <vscale x 4 x i32> %wide.masked.load23, %wide.masked.load
- %8 = getelementptr inbounds i32, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %7, ptr %8, i32 4, <vscale x 4 x i1> %4)
- %index.next = add i64 %index, %3
- %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
- %9 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
- br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define dso_local void @whilewr_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_multiple_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB7_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.d, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.d, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.d
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB7_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.d, x8, x9
-; CHECK-NEXT: b.mi .LBB7_2
-; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_multiple_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB7_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: add x10, x9, #7
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #7
-; CHECK-NOSVE2-NEXT: asr x9, x10, #3
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT: add x10, x9, #7
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #7
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: asr x10, x10, #3
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10
-; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB7_2
-; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp9 = icmp sgt i32 %n, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %c12 = ptrtoint ptr %c to i64
- %a13 = ptrtoint ptr %a to i64
- %b14 = ptrtoint ptr %b to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %sub.diff = sub i64 %a13, %c12
- %diff = sdiv i64 %sub.diff, 8
- %neg.compare = icmp slt i64 %sub.diff, -7
- %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
- %sub.diff16 = sub i64 %b14, %c12
- %diff17 = sdiv i64 %sub.diff16, 8
- %neg.compare18 = icmp slt i64 %sub.diff16, -7
- %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
- %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
- %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)
- %active.lane.mask.alias22 = or <vscale x 2 x i1> %ptr.diff.lane.mask21, %.splat20
- %0 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
- %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
- %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8>
- %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1)
- %3 = zext i8 %2 to i64
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %4 = and <vscale x 2 x i1> %active.lane.mask, %0
- %5 = getelementptr inbounds i64, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
- %6 = getelementptr inbounds i64, ptr %b, i64 %index
- %wide.masked.load23 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
- %7 = add <vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
- %8 = getelementptr inbounds i64, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
- %index.next = add i64 %index, %3
- %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
- %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
- br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-declare i64 @llvm.vscale.i64() #1
-
-declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
-
-declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
-
-declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
-
-declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
-
-declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
-
-declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
-
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
-
-declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
-
-declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
-
-declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
-
-declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
-
-declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
-
-attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+outline-atomics,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
-attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
More information about the llvm-commits
mailing list