[llvm] [AArch64] Lower alias mask to a whilewr (PR #100769)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 09:12:03 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Sam Tebbs (SamTebbs33)
https://github.com/llvm/llvm-project/pull/100579 emits IR that, based on a pair of pointers, creates a mask disabling the lanes that could alias within one loop iteration. This PR lowers that IR to the SVE2 WHILEWR instruction for AArch64.
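For reference, the alias-mask pattern being lowered looks roughly like the following (a minimal sketch distilled from the `whilewr_8` test in this patch, for i8 elements; the function and value names are illustrative):

```llvm
; All lanes are enabled when the pointer difference is negative; otherwise
; lane i is enabled when i is less than the pointer difference. With SVE2
; this whole sequence can be folded into a single WHILEWR.
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)

define <vscale x 16 x i1> @alias_mask_sketch(ptr %a, ptr %b) {
  %a.int = ptrtoint ptr %a to i64
  %b.int = ptrtoint ptr %b to i64
  %sub.diff = sub i64 %a.int, %b.int
  %neg.compare = icmp slt i64 %sub.diff, 0
  %splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
  %splat = shufflevector <vscale x 16 x i1> %splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  %ptr.diff.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
  %alias.mask = or <vscale x 16 x i1> %ptr.diff.lane.mask, %splat
  ret <vscale x 16 x i1> %alias.mask
}
```

For wider elements the same pattern additionally divides the pointer difference by the element size before feeding it to `get_active_lane_mask`, as in the `whilewr_16/32/64` tests below.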
---
Patch is 47.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100769.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+82)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll (+884)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d86e52d49000a..c2e9ba6291855 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -94,6 +94,7 @@
#include <bitset>
#include <cassert>
#include <cctype>
+#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iterator>
@@ -1523,6 +1524,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -13782,8 +13784,88 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
return ResultSLI;
}
+/// Try to lower the construction of a pointer alias mask to a WHILEWR.
+/// The mask's enabled lanes represent the elements that will not overlap
+/// across one loop iteration.
+/// This tries to match:
+///   or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+///      (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
+SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
+ if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
+ return SDValue();
+ auto LaneMask = Op.getOperand(0);
+ auto Splat = Op.getOperand(1);
+
+ if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+ LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
+ Splat.getOpcode() != ISD::SPLAT_VECTOR)
+ return SDValue();
+
+ auto Cmp = Splat.getOperand(0);
+ if (Cmp.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ CondCodeSDNode *Cond = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2));
+ assert(Cond && "SETCC doesn't have a condition code");
+
+ auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
+ if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
+ Cond->get() != ISD::CondCode::SETLT)
+ return SDValue();
+ unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
+ unsigned EltSize = CompValue + 1;
+ if (!isPowerOf2_64(EltSize) || EltSize > 64)
+ return SDValue();
+
+ auto Diff = Cmp.getOperand(0);
+ if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
+ return SDValue();
+
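+ // For elements wider than one byte, the lane mask's limit is the pointer
+ // difference divided by the element size; that division is expected to have
+ // been lowered to an arithmetic shift right by log2(element size).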
+ auto LaneMaskConst = dyn_cast<ConstantSDNode>(LaneMask.getOperand(1));
+ if (!LaneMaskConst || LaneMaskConst->getZExtValue() != 0 ||
+ (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
+ return SDValue();
+
+ // An alias mask for i8 elements omits the division because it would just
+ // divide by 1.
+ if (EltSize > 1) {
+ auto DiffDiv = LaneMask.getOperand(2);
+ auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
+ if (!DiffDivConst || DiffDivConst->getZExtValue() != std::log2(EltSize))
+ return SDValue();
+ } else if (LaneMask.getOperand(2) != Diff)
+ return SDValue();
+
+ auto StorePtr = Diff.getOperand(0);
+ auto ReadPtr = Diff.getOperand(1);
+
+ unsigned IntrinsicID = 0;
+ switch (EltSize) {
+ case 1:
+ IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
+ break;
+ case 2:
+ IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
+ break;
+ case 4:
+ IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
+ break;
+ case 8:
+ IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
+ break;
+ default:
+ return SDValue();
+ }
+ SDLoc DL(Op);
+ SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
+ auto N = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
+ StorePtr, ReadPtr);
+ return N;
+}
+
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
+
+ if (SDValue SV = tryWhileWRFromOR(Op, DAG))
+ return SV;
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
!Subtarget->isNeonAvailable()))
return LowerToScalableOp(Op, DAG);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 0000000000000..3662efa41c151
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,884 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+define dso_local void @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB0_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: mov w9, w3
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: cntp x10, p0, p0.b
+; CHECK-NEXT: and x10, x10, #0xff
+; CHECK-NEXT: .LBB0_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: whilelo p1.b, x8, x9
+; CHECK-NEXT: b.mi .LBB0_2
+; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB0_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
+; CHECK-NOSVE2-NEXT: .LBB0_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB0_2
+; CHECK-NOSVE2-NEXT: .LBB0_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b15, %c14
+ %neg.compare = icmp slt i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+ %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
+ %2 = zext i8 %1 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
+ %4 = getelementptr inbounds i8, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %5 = getelementptr inbounds i8, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
+ %7 = getelementptr inbounds i8, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
+ %index.next = add i64 %index, %2
+ %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB1_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.h, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB1_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
+; CHECK-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
+; CHECK-NEXT: inch x9
+; CHECK-NEXT: whilelo p0.h, x9, x8
+; CHECK-NEXT: b.mi .LBB1_2
+; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB1_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63
+; CHECK-NOSVE2-NEXT: cset w11, lt
+; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1
+; CHECK-NOSVE2-NEXT: asr x10, x10, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
+; CHECK-NOSVE2-NEXT: cnth x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB1_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB1_2
+; CHECK-NOSVE2-NEXT: .LBB1_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 3
+ %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b14, %c15
+ %diff = sdiv i64 %sub.diff, 2
+ %neg.compare = icmp slt i64 %sub.diff, -1
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i16, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %4 = getelementptr inbounds i16, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
+ %6 = getelementptr inbounds i16, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1
+ %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB2_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.s, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB2_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
+; CHECK-NEXT: incw x9
+; CHECK-NEXT: whilelo p0.s, x9, x8
+; CHECK-NEXT: b.mi .LBB2_2
+; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB2_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: add x11, x10, #3
+; CHECK-NOSVE2-NEXT: cmp x10, #0
+; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
+; CHECK-NOSVE2-NEXT: cmn x10, #3
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x11, x11, #2
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
+; CHECK-NOSVE2-NEXT: cntw x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB2_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: add x8, x8, x10
+; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9
+; CHECK-NOSVE2-NEXT: b.mi .LBB2_2
+; CHECK-NOSVE2-NEXT: .LBB2_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %b12 = ptrtoint ptr %b to i64
+ %c13 = ptrtoint ptr %c to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+ %sub.diff = sub i64 %b12, %c13
+ %diff = sdiv i64 %sub.diff, 4
+ %neg.compare = icmp slt i64 %sub.diff, -3
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+ %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %4 = getelementptr inbounds i32, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i32, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
+ %index.next = add i64 %index, %1
+ %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+ br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB3_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: whilewr p1.d, x1, x2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: .LBB3_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
+; CHECK-NEXT: incd x9
+; CHECK-NEXT: whilelo p0.d, x9, x8
+; CHECK-NEXT: b.mi .LBB3_2
+; CHECK-NEXT: .LBB3_3: // %for.cond.cleanup
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: cmp w3, #1
+; CHECK-NOSVE2-NEXT: b.lt .LBB3_3
+; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NOSVE2-NEXT: mov w9, w3
+; CHECK-NOSVE2-NEXT: sub x10, x1, x2
+; CHECK-NOSVE2-NEXT: mov x8, xzr
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: add x11, x10, #7
+; CHECK-NOSVE2-NEXT: cmp x10, #0
+; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
+; CHECK-NOSVE2-NEXT: cmn x10, #7
+; CHECK-NOSVE2-NEXT: cset w10, lt
+; CHECK-NOSVE2-NEXT: asr x11, x11, #3
+; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
+; CHECK-NOSVE2-NEXT: cntd x10
+; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: .LBB3_2: // %vector.body
+; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/100769