[llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)

Sam Tebbs via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 21 09:16:57 PST 2024


https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579

>From dc3dba57e8cd76191ba03270d512f9931d4f4600 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 15 Nov 2024 10:24:46 +0000
Subject: [PATCH 01/12] [Intrinsics][AArch64] Add intrinsic to mask off
 aliasing vector lanes

It can be unsafe to load a vector from one address and store a vector
to another address if the two accesses overlap within a vectorised loop
iteration, i.e. if some lanes of the load share addresses with lanes of
the store.

This PR adds an intrinsic designed to create a mask with lanes disabled
if they overlap between the two pointer arguments, so that only safe
lanes are loaded, operated on and stored.

Along with the two pointer parameters, the intrinsic also takes an
immediate that represents the size in bytes of the vector element
type, as well as an immediate i1 that is true if there is a
write-after-read hazard or false if there is a read-after-write hazard.

This will be used by #100579 and replaces the existing lowering for
whilewr, since that is no longer needed now that we have the intrinsic.
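
As a rough illustration of the semantics (the concrete values mirror the
example added to LangRef in this patch): with ptrA = 20, ptrB = 23, an
element size of 1 byte and writeAfterRead = 1, the difference is 3
elements, so only the first three lanes are safe to operate on:

  ; %diff = (23 - 20) / 1 = 3; lane i is enabled if i < %diff or %diff <= 0
  %mask = call <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64(i64 20, i64 23, i32 1, i1 1)
  ; %mask is <1, 1, 1, 0, 0, 0, 0, 0>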
---
 llvm/docs/LangRef.rst                         |   80 ++
 llvm/include/llvm/CodeGen/TargetLowering.h    |    5 +
 llvm/include/llvm/IR/Intrinsics.td            |    5 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   44 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  182 +--
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    6 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |    9 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |   10 +-
 llvm/test/CodeGen/AArch64/alias_mask.ll       |  421 +++++++
 .../CodeGen/AArch64/alias_mask_scalable.ll    |   82 ++
 llvm/test/CodeGen/AArch64/whilewr.ll          | 1086 -----------------
 11 files changed, 714 insertions(+), 1216 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/alias_mask.ll
 create mode 100644 llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/whilewr.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 9f4c90ba82a419..c9589d5af8ebbe 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -23475,6 +23475,86 @@ Examples:
       %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %elem0, i64 429)
       %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
 
+.. _int_experimental_get_alias_lane_mask:
+
+'``llvm.experimental.get.alias.lane.mask.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <16 x i1> @llvm.experimental.get.alias.lane.mask.v16i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
+
+
+Overview:
+"""""""""
+
+Create a mask representing lanes that do or do not overlap between two pointers across one vector loop iteration.
+
+
+Arguments:
+""""""""""
+
+The first two arguments have the same scalar integer type.
+The final two arguments are immediates (the element size in bytes and the
+write-after-read flag), and the result is a vector with the i1 element type.
+
+Semantics:
+""""""""""
+
+In the case that ``%writeAfterRead`` is true, the '``llvm.experimental.get.alias.lane.mask.*``' intrinsics are semantically equivalent
+to:
+
+::
+
+      %diff = (%ptrB - %ptrA) / %elementSize
+      %m[i] = (icmp ult i, %diff) || (%diff <= 0)
+
+Otherwise they are semantically equivalent to:
+
+::
+
+      %diff = abs(%ptrB - %ptrA) / %elementSize
+      %m[i] = (icmp ult i, %diff) || (%diff == 0)
+
+where ``%m`` is a vector (mask) of active/inactive lanes with its elements
+indexed by ``i``, and ``%ptrA``, ``%ptrB`` are the two integer arguments to
+``llvm.experimental.get.alias.lane.mask.*``, ``%elementSize`` is the i32
+argument, ``%abs`` is the absolute value operation, ``%icmp`` is an integer
+compare and ``ult`` the unsigned less-than comparison operator. The
+subtraction between ``%ptrA`` and ``%ptrB`` could be negative. The
+``%writeAfterRead`` argument is expected to be true if ``%ptrB`` is stored to
+after ``%ptrA`` is read from.
+The above is equivalent to:
+
+::
+
+      %m = @llvm.experimental.get.alias.lane.mask(%ptrA, %ptrB, %elementSize, %writeAfterRead)
+
+This can, for example, be emitted by the loop vectorizer, in which case
+``%ptrA`` is a pointer that is read from within the loop and ``%ptrB`` is a
+pointer that is stored to within the loop.
+If the difference between these pointers is less than the vector factor, then
+they overlap (alias) within a loop iteration.
+For example, if ``%ptrA`` is 20 and ``%ptrB`` is 23 with a vector factor of 8
+(and an element size of 1 byte), then lanes 3, 4, 5, 6 and 7 of the vector
+loaded from ``%ptrA`` share addresses with lanes 0, 1, 2, 3 and 4 of the
+vector stored to ``%ptrB``.
+An alias mask for these two pointers should be <1, 1, 1, 0, 0, 0, 0, 0> so
+that only the non-overlapping lanes are loaded and stored.
+This operation allows many loops to be vectorised when it would otherwise be
+unsafe to do so.
+
+To account for the fact that only a subset of lanes have been operated on in an iteration,
+the loop's induction variable should be incremented by the popcount of the mask rather than the vector factor.
+
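+For illustration, a minimal sketch of such an increment (assuming a
+``<16 x i1>`` alias mask ``%alias.lane.mask`` and an induction variable
+``%index``; the surrounding loop is omitted):
+
+.. code-block:: llvm
+
+      %mask.zext = zext <16 x i1> %alias.lane.mask to <16 x i8>
+      %popcount = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %mask.zext)
+      %popcount.zext = zext i8 %popcount to i64
+      %index.next = add i64 %index, %popcount.zext
+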
+This mask ``%m`` can e.g. be used in masked load/store instructions.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %alias.lane.mask = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 4, i1 1)
+      %vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptrA, i32 4, <4 x i1> %alias.lane.mask, <4 x i32> poison)
+      [...]
+      call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, <4 x i32>* %ptrB, i32 4, <4 x i1> %alias.lane.mask)
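+
+The read-after-write case differs only in the final immediate argument. A
+minimal sketch, reusing the pointers from the example above:
+
+.. code-block:: llvm
+
+      %alias.lane.mask.rw = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 4, i1 0)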
 
 .. _int_experimental_vp_splice:
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 6a41094ff933b0..0338310fd936df 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -468,6 +468,11 @@ class TargetLoweringBase {
     return true;
   }
 
+  /// Return true if the @llvm.experimental.get.alias.lane.mask intrinsic
+  /// should be expanded using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
+                                            unsigned EltSize) const {
+    return true;
+  }
+
   virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
                                            bool IsScalable) const {
     return true;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1ca8c2565ab0b6..5f7073a531283e 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2363,6 +2363,11 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>
                                     llvm_i32_ty]>;
 }
 
+def int_experimental_get_alias_lane_mask:
+  DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+            [llvm_anyint_ty, LLVMMatchType<1>, llvm_anyint_ty, llvm_i1_ty],
+            [IntrNoMem, IntrNoSync, IntrWillReturn,
+             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
 def int_get_active_lane_mask:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
             [llvm_anyint_ty, LLVMMatchType<1>],
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9d729d448502d8..39e84e06a8de60 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8284,6 +8284,50 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     visitVectorExtractLastActive(I, Intrinsic);
     return;
   }
+  case Intrinsic::experimental_get_alias_lane_mask: {
+    SDValue SourceValue = getValue(I.getOperand(0));
+    SDValue SinkValue = getValue(I.getOperand(1));
+    SDValue EltSize = getValue(I.getOperand(2));
+    bool IsWriteAfterRead =
+        cast<ConstantSDNode>(getValue(I.getOperand(3)))->getZExtValue() != 0;
+    auto IntrinsicVT = EVT::getEVT(I.getType());
+    auto PtrVT = SourceValue->getValueType(0);
+
+    if (!TLI.shouldExpandGetAliasLaneMask(
+            IntrinsicVT, PtrVT,
+            cast<ConstantSDNode>(EltSize)->getSExtValue())) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    SDValue Diff = DAG.getNode(ISD::SUB, sdl, PtrVT, SinkValue, SourceValue);
+    if (!IsWriteAfterRead)
+      Diff = DAG.getNode(ISD::ABS, sdl, PtrVT, Diff);
+
+    Diff = DAG.getNode(ISD::SDIV, sdl, PtrVT, Diff, EltSize);
+    SDValue Zero = DAG.getTargetConstant(0, sdl, PtrVT);
+
+    // If the difference is positive then some elements may alias.
+    auto CmpVT =
+        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), PtrVT);
+    SDValue Cmp = DAG.getSetCC(sdl, CmpVT, Diff, Zero,
+                               IsWriteAfterRead ? ISD::SETLE : ISD::SETEQ);
+
+    // Splat the compare result then OR it with a lane mask.
+    SDValue Splat = DAG.getSplat(IntrinsicVT, sdl, Cmp);
+
+    SDValue DiffMask;
+    // Don't emit an active lane mask if the target doesn't support it.
+    if (TLI.shouldExpandGetActiveLaneMask(IntrinsicVT, PtrVT)) {
+      EVT VecTy = EVT::getVectorVT(*DAG.getContext(), PtrVT,
+                                   IntrinsicVT.getVectorElementCount());
+      SDValue DiffSplat = DAG.getSplat(VecTy, sdl, Diff);
+      SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
+      DiffMask = DAG.getSetCC(sdl, IntrinsicVT, VectorStep, DiffSplat,
+                              ISD::CondCode::SETULT);
+    } else {
+      DiffMask = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, sdl, IntrinsicVT,
+          DAG.getTargetConstant(Intrinsic::get_active_lane_mask, sdl, MVT::i64),
+          Zero, Diff);
+    }
+    // A lane is enabled either because the whole vector is known not to alias
+    // (Splat) or because its index is below the number of non-aliasing
+    // elements (DiffMask).
+    SDValue Or = DAG.getNode(ISD::OR, sdl, IntrinsicVT, DiffMask, Splat);
+    setValue(&I, Or);
+    return;
+  }
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7ab3fc06715ec8..66eaec0d5ae6c9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2033,6 +2033,24 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
   return false;
 }
 
+bool AArch64TargetLowering::shouldExpandGetAliasLaneMask(
+    EVT VT, EVT PtrVT, unsigned EltSize) const {
+  if (!Subtarget->hasSVE2())
+    return true;
+
+  if (PtrVT != MVT::i64)
+    return true;
+
+  if (VT == MVT::v2i1 || VT == MVT::nxv2i1)
+    return EltSize != 8;
+  if (VT == MVT::v4i1 || VT == MVT::nxv4i1)
+    return EltSize != 4;
+  if (VT == MVT::v8i1 || VT == MVT::nxv8i1)
+    return EltSize != 2;
+  if (VT == MVT::v16i1 || VT == MVT::nxv16i1)
+    return EltSize != 1;
+  return true;
+}
+
 bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
     const IntrinsicInst *I) const {
   if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
@@ -2796,6 +2814,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::LS64_BUILD)
     MAKE_CASE(AArch64ISD::LS64_EXTRACT)
     MAKE_CASE(AArch64ISD::TBL)
+    MAKE_CASE(AArch64ISD::WHILEWR)
+    MAKE_CASE(AArch64ISD::WHILERW)
     MAKE_CASE(AArch64ISD::FADD_PRED)
     MAKE_CASE(AArch64ISD::FADDA_PRED)
     MAKE_CASE(AArch64ISD::FADDV_PRED)
@@ -5881,6 +5901,16 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     EVT PtrVT = getPointerTy(DAG.getDataLayout());
     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
   }
+  case Intrinsic::aarch64_sve_whilewr_b:
+  case Intrinsic::aarch64_sve_whilewr_h:
+  case Intrinsic::aarch64_sve_whilewr_s:
+  case Intrinsic::aarch64_sve_whilewr_d:
+    return DAG.getNode(AArch64ISD::WHILEWR, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::aarch64_sve_whilerw_b:
+  case Intrinsic::aarch64_sve_whilerw_h:
+  case Intrinsic::aarch64_sve_whilerw_s:
+  case Intrinsic::aarch64_sve_whilerw_d:
+    return DAG.getNode(AArch64ISD::WHILERW, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::aarch64_neon_abs: {
     EVT Ty = Op.getValueType();
     if (Ty == MVT::i64) {
@@ -6340,16 +6370,39 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   }
+  case Intrinsic::experimental_get_alias_lane_mask:
   case Intrinsic::get_active_lane_mask: {
+    unsigned IntrinsicID = Intrinsic::aarch64_sve_whilelo;
+    if (IntNo == Intrinsic::experimental_get_alias_lane_mask) {
+      uint64_t EltSize = Op.getOperand(3)->getAsZExtVal();
+      bool IsWriteAfterRead = Op.getOperand(4)->getAsZExtVal() == 1;
+      switch (EltSize) {
+      case 1:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b
+                                       : Intrinsic::aarch64_sve_whilerw_b;
+        break;
+      case 2:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h
+                                       : Intrinsic::aarch64_sve_whilerw_h;
+        break;
+      case 4:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s
+                                       : Intrinsic::aarch64_sve_whilerw_s;
+        break;
+      case 8:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d
+                                       : Intrinsic::aarch64_sve_whilerw_d;
+        break;
+      default:
+        llvm_unreachable("Unexpected element size for get.alias.lane.mask");
+      }
+    }
     SDValue ID =
-        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
+        DAG.getTargetConstant(IntrinsicID, dl, MVT::i64);
 
     EVT VT = Op.getValueType();
     if (VT.isScalableVector())
       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
                          Op.getOperand(2));
 
-    // We can use the SVE whilelo instruction to lower this intrinsic by
+    // We can use the SVE whilelo/whilewr/whilerw instructions to lower this
+    // intrinsic by
     // creating the appropriate sequence of scalable vector operations and
     // then extracting a fixed-width subvector from the scalable vector.
 
@@ -14241,128 +14294,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   return ResultSLI;
 }
 
-/// Try to lower the construction of a pointer alias mask to a WHILEWR.
-/// The mask's enabled lanes represent the elements that will not overlap across
-/// one loop iteration. This tries to match:
-/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
-///    (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
-SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
-                         const AArch64Subtarget &Subtarget) {
-  if (!Subtarget.hasSVE2())
-    return SDValue();
-  SDValue LaneMask = Op.getOperand(0);
-  SDValue Splat = Op.getOperand(1);
-
-  if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
-    std::swap(LaneMask, Splat);
-
-  if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
-      LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
-      Splat.getOpcode() != ISD::SPLAT_VECTOR)
-    return SDValue();
-
-  SDValue Cmp = Splat.getOperand(0);
-  if (Cmp.getOpcode() != ISD::SETCC)
-    return SDValue();
-
-  CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
-
-  auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
-  if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
-      Cond->get() != ISD::CondCode::SETLT)
-    return SDValue();
-  unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
-  unsigned EltSize = CompValue + 1;
-  if (!isPowerOf2_64(EltSize) || EltSize > 8)
-    return SDValue();
-
-  SDValue Diff = Cmp.getOperand(0);
-  if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
-    return SDValue();
-
-  if (!isNullConstant(LaneMask.getOperand(1)) ||
-      (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
-    return SDValue();
-
-  // The number of elements that alias is calculated by dividing the positive
-  // difference between the pointers by the element size. An alias mask for i8
-  // elements omits the division because it would just divide by 1
-  if (EltSize > 1) {
-    SDValue DiffDiv = LaneMask.getOperand(2);
-    auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
-    if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
-      return SDValue();
-    if (EltSize > 2) {
-      // When masking i32 or i64 elements, the positive value of the
-      // possibly-negative difference comes from a select of the difference if
-      // it's positive, otherwise the difference plus the element size if it's
-      // negative: pos_diff = diff < 0 ? (diff + 7) : diff
-      SDValue Select = DiffDiv.getOperand(0);
-      // Make sure the difference is being compared by the select
-      if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
-        return SDValue();
-      // Make sure it's checking if the difference is less than 0
-      if (!isNullConstant(Select.getOperand(1)) ||
-          cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
-              ISD::CondCode::SETLT)
-        return SDValue();
-      // An add creates a positive value from the negative difference
-      SDValue Add = Select.getOperand(2);
-      if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
-        return SDValue();
-      if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1));
-          !AddConst || AddConst->getZExtValue() != EltSize - 1)
-        return SDValue();
-    } else {
-      // When masking i16 elements, this positive value comes from adding the
-      // difference's sign bit to the difference itself. This is equivalent to
-      // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff)
-      SDValue Add = DiffDiv.getOperand(0);
-      if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
-        return SDValue();
-      // A logical right shift by 63 extracts the sign bit from the difference
-      SDValue Shift = Add.getOperand(1);
-      if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
-        return SDValue();
-      if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
-          !ShiftConst || ShiftConst->getZExtValue() != 63)
-        return SDValue();
-    }
-  } else if (LaneMask.getOperand(2) != Diff)
-    return SDValue();
-
-  SDValue StorePtr = Diff.getOperand(0);
-  SDValue ReadPtr = Diff.getOperand(1);
-
-  unsigned IntrinsicID = 0;
-  switch (EltSize) {
-  case 1:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
-    break;
-  case 2:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
-    break;
-  case 4:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
-    break;
-  case 8:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
-    break;
-  default:
-    return SDValue();
-  }
-  SDLoc DL(Op);
-  SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
-                     StorePtr, ReadPtr);
-}
-
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  if (SDValue SV =
-          tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
-    return SV;
-
   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                    !Subtarget->isNeonAvailable()))
     return LowerToScalableOp(Op, DAG);
@@ -19609,7 +19542,9 @@ static bool isPredicateCCSettingOp(SDValue N) {
         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
         // get_active_lane_mask is lowered to a whilelo instruction.
-        N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
+        N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask ||
+        // get_alias_lane_mask is lowered to a whilewr/rw instruction.
+        N.getConstantOperandVal(0) == Intrinsic::experimental_get_alias_lane_mask)))
     return true;
 
   return false;
@@ -27175,6 +27110,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
       return;
     }
     case Intrinsic::experimental_vector_match:
+    case Intrinsic::experimental_get_alias_lane_mask:
     case Intrinsic::get_active_lane_mask: {
       if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
         return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9e965277aa..b2f766b22911ff 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -297,6 +297,10 @@ enum NodeType : unsigned {
   SMAXV,
   UMAXV,
 
+  // Alias lane masks
+  WHILEWR,
+  WHILERW,
+
   SADDV_PRED,
   UADDV_PRED,
   SMAXV_PRED,
@@ -980,6 +984,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
 
+  bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
+                                    unsigned EltSize) const override;
+
   bool
   shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 564fb33758ad57..99b1e0618ab34b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -140,6 +140,11 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
 // AArch64 SVE/SVE2 - the remaining node definitions
 //
 
+// Alias masks
+def SDT_AArch64Mask : SDTypeProfile<1, 2,
+  [SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>]>;
+def AArch64whilewr : SDNode<"AArch64ISD::WHILEWR", SDT_AArch64Mask>;
+def AArch64whilerw : SDNode<"AArch64ISD::WHILERW", SDT_AArch64Mask>;
+
 // SVE CNT/INC/RDVL
 def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
 def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
@@ -3913,8 +3918,8 @@ let Predicates = [HasSVE2orSME] in {
   defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, int_aarch64_sve_whilelo>;
 
   // SVE2 pointer conflict compare
-  defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
-  defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
+  defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", AArch64whilewr>;
+  defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", AArch64whilerw>;
 } // End HasSVE2orSME
 
 let Predicates = [HasSVEAES, HasNonStreamingSVE2orSSVE_AES] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1ddb913f013f5e..a0d5bb1504089d 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5765,16 +5765,16 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
   let isWhile = 1;
 }
 
-multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
+multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> {
   def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
   def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
   def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
   def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
 
-  def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
-  def : SVE_2_Op_Pat<nxv8i1,  !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
-  def : SVE_2_Op_Pat<nxv4i1,  !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
-  def : SVE_2_Op_Pat<nxv2i1,  !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>;
+  def : SVE_2_Op_Pat<nxv8i1,  op, i64, i64, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pat<nxv4i1,  op, i64, i64, !cast<Instruction>(NAME # _S)>;
+  def : SVE_2_Op_Pat<nxv2i1,  op, i64, i64, !cast<Instruction>(NAME # _D)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
new file mode 100644
index 00000000000000..59ad2bf82db92e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -0,0 +1,421 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: llc -mtriple=aarch64 %s | FileCheck %s --check-prefix=CHECK-NOSVE
+
+define <16 x i1> @whilewr_8(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilewr_8:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilewr p0.b, x0, x1
+; CHECK-SVE-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilewr_8:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI0_1
+; CHECK-NOSVE-NEXT:    sub x9, x1, x0
+; CHECK-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI0_2
+; CHECK-NOSVE-NEXT:    ldr q1, [x10, :lo12:.LCPI0_1]
+; CHECK-NOSVE-NEXT:    ldr q3, [x8, :lo12:.LCPI0_2]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI0_4
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI0_3
+; CHECK-NOSVE-NEXT:    ldr q5, [x8, :lo12:.LCPI0_4]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI0_5
+; CHECK-NOSVE-NEXT:    dup v2.2d, x9
+; CHECK-NOSVE-NEXT:    ldr q4, [x10, :lo12:.LCPI0_3]
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI0_6
+; CHECK-NOSVE-NEXT:    ldr q6, [x8, :lo12:.LCPI0_5]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI0_7
+; CHECK-NOSVE-NEXT:    ldr q7, [x10, :lo12:.LCPI0_6]
+; CHECK-NOSVE-NEXT:    cmp x9, #1
+; CHECK-NOSVE-NEXT:    ldr q16, [x8, :lo12:.LCPI0_7]
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v2.2d, v0.2d
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v2.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v3.2d, v2.2d, v3.2d
+; CHECK-NOSVE-NEXT:    cmhi v4.2d, v2.2d, v4.2d
+; CHECK-NOSVE-NEXT:    cmhi v5.2d, v2.2d, v5.2d
+; CHECK-NOSVE-NEXT:    cmhi v6.2d, v2.2d, v6.2d
+; CHECK-NOSVE-NEXT:    cmhi v7.2d, v2.2d, v7.2d
+; CHECK-NOSVE-NEXT:    cmhi v2.2d, v2.2d, v16.2d
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NOSVE-NEXT:    cset w8, lt
+; CHECK-NOSVE-NEXT:    uzp1 v1.4s, v4.4s, v3.4s
+; CHECK-NOSVE-NEXT:    uzp1 v3.4s, v6.4s, v5.4s
+; CHECK-NOSVE-NEXT:    uzp1 v2.4s, v2.4s, v7.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-NOSVE-NEXT:    dup v1.16b, w8
+; CHECK-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <16 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 1, i1 1)
+  ret <16 x i1> %0
+}
+
+define <8 x i1> @whilewr_16(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilewr_16:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilewr p0.b, x0, x1
+; CHECK-SVE-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilewr_16:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    sub x8, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI1_0
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI1_1
+; CHECK-NOSVE-NEXT:    add x8, x8, x8, lsr #63
+; CHECK-NOSVE-NEXT:    adrp x11, .LCPI1_2
+; CHECK-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI1_0]
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI1_3
+; CHECK-NOSVE-NEXT:    ldr q2, [x10, :lo12:.LCPI1_1]
+; CHECK-NOSVE-NEXT:    ldr q3, [x11, :lo12:.LCPI1_2]
+; CHECK-NOSVE-NEXT:    asr x8, x8, #1
+; CHECK-NOSVE-NEXT:    ldr q4, [x9, :lo12:.LCPI1_3]
+; CHECK-NOSVE-NEXT:    dup v0.2d, x8
+; CHECK-NOSVE-NEXT:    cmp x8, #1
+; CHECK-NOSVE-NEXT:    cset w8, lt
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v2.2d, v0.2d, v2.2d
+; CHECK-NOSVE-NEXT:    cmhi v3.2d, v0.2d, v3.2d
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v4.2d
+; CHECK-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v1.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NOSVE-NEXT:    dup v1.8b, w8
+; CHECK-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 2, i1 1)
+  ret <8 x i1> %0
+}
+
+define <4 x i1> @whilewr_32(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilewr_32:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilewr p0.h, x0, x1
+; CHECK-SVE-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilewr_32:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    sub x9, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NOSVE-NEXT:    add x10, x9, #3
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NOSVE-NEXT:    csel x9, x10, x9, lt
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI2_1
+; CHECK-NOSVE-NEXT:    asr x9, x9, #2
+; CHECK-NOSVE-NEXT:    ldr q2, [x10, :lo12:.LCPI2_1]
+; CHECK-NOSVE-NEXT:    dup v0.2d, x9
+; CHECK-NOSVE-NEXT:    cmp x9, #1
+; CHECK-NOSVE-NEXT:    cset w8, lt
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v2.2d
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NOSVE-NEXT:    dup v1.4h, w8
+; CHECK-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 4, i1 1)
+  ret <4 x i1> %0
+}
+
+define <2 x i1> @whilewr_64(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilewr_64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilewr p0.s, x0, x1
+; CHECK-SVE-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilewr_64:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    sub x9, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NOSVE-NEXT:    add x10, x9, #7
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NOSVE-NEXT:    csel x9, x10, x9, lt
+; CHECK-NOSVE-NEXT:    asr x9, x9, #3
+; CHECK-NOSVE-NEXT:    dup v0.2d, x9
+; CHECK-NOSVE-NEXT:    cmp x9, #1
+; CHECK-NOSVE-NEXT:    cset w8, lt
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; CHECK-NOSVE-NEXT:    dup v1.2s, w8
+; CHECK-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <2 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 8, i1 1)
+  ret <2 x i1> %0
+}
+
+define <16 x i1> @whilerw_8(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilerw_8:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilerw p0.b, x0, x1
+; CHECK-SVE-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilerw_8:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NOSVE-NEXT:    subs x9, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI4_1
+; CHECK-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NOSVE-NEXT:    cneg x9, x9, mi
+; CHECK-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI4_2]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI4_3
+; CHECK-NOSVE-NEXT:    ldr q1, [x10, :lo12:.LCPI4_1]
+; CHECK-NOSVE-NEXT:    ldr q4, [x8, :lo12:.LCPI4_3]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI4_4
+; CHECK-NOSVE-NEXT:    dup v3.2d, x9
+; CHECK-NOSVE-NEXT:    ldr q5, [x8, :lo12:.LCPI4_4]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI4_5
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI4_6
+; CHECK-NOSVE-NEXT:    ldr q6, [x8, :lo12:.LCPI4_5]
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI4_7
+; CHECK-NOSVE-NEXT:    ldr q7, [x10, :lo12:.LCPI4_6]
+; CHECK-NOSVE-NEXT:    ldr q16, [x8, :lo12:.LCPI4_7]
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v3.2d, v0.2d
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v3.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v2.2d, v3.2d, v2.2d
+; CHECK-NOSVE-NEXT:    cmhi v4.2d, v3.2d, v4.2d
+; CHECK-NOSVE-NEXT:    cmhi v5.2d, v3.2d, v5.2d
+; CHECK-NOSVE-NEXT:    cmhi v6.2d, v3.2d, v6.2d
+; CHECK-NOSVE-NEXT:    cmhi v7.2d, v3.2d, v7.2d
+; CHECK-NOSVE-NEXT:    cmhi v3.2d, v3.2d, v16.2d
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    uzp1 v1.4s, v4.4s, v2.4s
+; CHECK-NOSVE-NEXT:    cset w8, eq
+; CHECK-NOSVE-NEXT:    uzp1 v2.4s, v6.4s, v5.4s
+; CHECK-NOSVE-NEXT:    uzp1 v3.4s, v3.4s, v7.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; CHECK-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-NOSVE-NEXT:    dup v1.16b, w8
+; CHECK-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <16 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 1, i1 0)
+  ret <16 x i1> %0
+}
+
+define <8 x i1> @whilerw_16(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilerw_16:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilerw p0.b, x0, x1
+; CHECK-SVE-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilerw_16:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    subs x8, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI5_0
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI5_1
+; CHECK-NOSVE-NEXT:    cneg x8, x8, mi
+; CHECK-NOSVE-NEXT:    adrp x11, .LCPI5_2
+; CHECK-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI5_0]
+; CHECK-NOSVE-NEXT:    add x8, x8, x8, lsr #63
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI5_3
+; CHECK-NOSVE-NEXT:    ldr q2, [x10, :lo12:.LCPI5_1]
+; CHECK-NOSVE-NEXT:    ldr q3, [x11, :lo12:.LCPI5_2]
+; CHECK-NOSVE-NEXT:    ldr q4, [x9, :lo12:.LCPI5_3]
+; CHECK-NOSVE-NEXT:    asr x8, x8, #1
+; CHECK-NOSVE-NEXT:    dup v0.2d, x8
+; CHECK-NOSVE-NEXT:    cmp x8, #0
+; CHECK-NOSVE-NEXT:    cset w8, eq
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v2.2d, v0.2d, v2.2d
+; CHECK-NOSVE-NEXT:    cmhi v3.2d, v0.2d, v3.2d
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v4.2d
+; CHECK-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v1.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NOSVE-NEXT:    dup v1.8b, w8
+; CHECK-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 2, i1 0)
+  ret <8 x i1> %0
+}
+
+define <4 x i1> @whilerw_32(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilerw_32:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilerw p0.h, x0, x1
+; CHECK-SVE-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilerw_32:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    subs x9, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; CHECK-NOSVE-NEXT:    cneg x9, x9, mi
+; CHECK-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
+; CHECK-NOSVE-NEXT:    add x10, x9, #3
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    csel x9, x10, x9, lt
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI6_1
+; CHECK-NOSVE-NEXT:    asr x9, x9, #2
+; CHECK-NOSVE-NEXT:    ldr q2, [x10, :lo12:.LCPI6_1]
+; CHECK-NOSVE-NEXT:    dup v0.2d, x9
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    cset w8, eq
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v2.2d
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NOSVE-NEXT:    dup v1.4h, w8
+; CHECK-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 4, i1 0)
+  ret <4 x i1> %0
+}
+
+define <2 x i1> @whilerw_64(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: whilerw_64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    whilerw p0.s, x0, x1
+; CHECK-SVE-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: whilerw_64:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    subs x9, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; CHECK-NOSVE-NEXT:    cneg x9, x9, mi
+; CHECK-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-NOSVE-NEXT:    add x10, x9, #7
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    csel x9, x10, x9, lt
+; CHECK-NOSVE-NEXT:    asr x9, x9, #3
+; CHECK-NOSVE-NEXT:    dup v0.2d, x9
+; CHECK-NOSVE-NEXT:    cmp x9, #0
+; CHECK-NOSVE-NEXT:    cset w8, eq
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; CHECK-NOSVE-NEXT:    dup v1.2s, w8
+; CHECK-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <2 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 8, i1 0)
+  ret <2 x i1> %0
+}
+
+define <16 x i1> @not_whilewr_wrong_eltsize(i64 %a, i64 %b) {
+; CHECK-SVE-LABEL: not_whilewr_wrong_eltsize:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    sub x8, x1, x0
+; CHECK-SVE-NEXT:    add x8, x8, x8, lsr #63
+; CHECK-SVE-NEXT:    asr x8, x8, #1
+; CHECK-SVE-NEXT:    cmp x8, #1
+; CHECK-SVE-NEXT:    cset w9, lt
+; CHECK-SVE-NEXT:    whilelo p0.b, #0, x8
+; CHECK-SVE-NEXT:    dup v0.16b, w9
+; CHECK-SVE-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: not_whilewr_wrong_eltsize:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    sub x8, x1, x0
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI8_0
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI8_1
+; CHECK-NOSVE-NEXT:    add x8, x8, x8, lsr #63
+; CHECK-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI8_0]
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI8_2
+; CHECK-NOSVE-NEXT:    ldr q2, [x9, :lo12:.LCPI8_2]
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI8_4
+; CHECK-NOSVE-NEXT:    ldr q1, [x10, :lo12:.LCPI8_1]
+; CHECK-NOSVE-NEXT:    asr x8, x8, #1
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI8_3
+; CHECK-NOSVE-NEXT:    ldr q5, [x9, :lo12:.LCPI8_4]
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI8_6
+; CHECK-NOSVE-NEXT:    ldr q3, [x10, :lo12:.LCPI8_3]
+; CHECK-NOSVE-NEXT:    adrp x10, .LCPI8_5
+; CHECK-NOSVE-NEXT:    dup v4.2d, x8
+; CHECK-NOSVE-NEXT:    ldr q7, [x9, :lo12:.LCPI8_6]
+; CHECK-NOSVE-NEXT:    adrp x9, .LCPI8_7
+; CHECK-NOSVE-NEXT:    ldr q6, [x10, :lo12:.LCPI8_5]
+; CHECK-NOSVE-NEXT:    ldr q16, [x9, :lo12:.LCPI8_7]
+; CHECK-NOSVE-NEXT:    cmp x8, #1
+; CHECK-NOSVE-NEXT:    cset w8, lt
+; CHECK-NOSVE-NEXT:    cmhi v0.2d, v4.2d, v0.2d
+; CHECK-NOSVE-NEXT:    cmhi v1.2d, v4.2d, v1.2d
+; CHECK-NOSVE-NEXT:    cmhi v2.2d, v4.2d, v2.2d
+; CHECK-NOSVE-NEXT:    cmhi v3.2d, v4.2d, v3.2d
+; CHECK-NOSVE-NEXT:    cmhi v5.2d, v4.2d, v5.2d
+; CHECK-NOSVE-NEXT:    cmhi v6.2d, v4.2d, v6.2d
+; CHECK-NOSVE-NEXT:    cmhi v7.2d, v4.2d, v7.2d
+; CHECK-NOSVE-NEXT:    cmhi v4.2d, v4.2d, v16.2d
+; CHECK-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
+; CHECK-NOSVE-NEXT:    uzp1 v2.4s, v6.4s, v5.4s
+; CHECK-NOSVE-NEXT:    uzp1 v3.4s, v4.4s, v7.4s
+; CHECK-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; CHECK-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-NOSVE-NEXT:    dup v1.16b, w8
+; CHECK-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <16 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 2, i1 1)
+  ret <16 x i1> %0
+}
+
+define <2 x i1> @not_whilerw_ptr32(i32 %a, i32 %b) {
+; CHECK-SVE-LABEL: not_whilerw_ptr32:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    subs w8, w1, w0
+; CHECK-SVE-NEXT:    cneg w8, w8, mi
+; CHECK-SVE-NEXT:    add w9, w8, #7
+; CHECK-SVE-NEXT:    cmp w8, #0
+; CHECK-SVE-NEXT:    csel w8, w9, w8, lt
+; CHECK-SVE-NEXT:    asr w8, w8, #3
+; CHECK-SVE-NEXT:    cmp w8, #0
+; CHECK-SVE-NEXT:    cset w9, eq
+; CHECK-SVE-NEXT:    whilelo p0.s, #0, w8
+; CHECK-SVE-NEXT:    dup v0.2s, w9
+; CHECK-SVE-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-SVE-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-NOSVE-LABEL: not_whilerw_ptr32:
+; CHECK-NOSVE:       // %bb.0: // %entry
+; CHECK-NOSVE-NEXT:    subs w9, w1, w0
+; CHECK-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; CHECK-NOSVE-NEXT:    cneg w9, w9, mi
+; CHECK-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
+; CHECK-NOSVE-NEXT:    add w10, w9, #7
+; CHECK-NOSVE-NEXT:    cmp w9, #0
+; CHECK-NOSVE-NEXT:    csel w9, w10, w9, lt
+; CHECK-NOSVE-NEXT:    asr w9, w9, #3
+; CHECK-NOSVE-NEXT:    dup v0.2s, w9
+; CHECK-NOSVE-NEXT:    cmp w9, #0
+; CHECK-NOSVE-NEXT:    cset w8, eq
+; CHECK-NOSVE-NEXT:    dup v2.2s, w8
+; CHECK-NOSVE-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NOSVE-NEXT:    orr v0.8b, v0.8b, v2.8b
+; CHECK-NOSVE-NEXT:    ret
+entry:
+  %0 = call <2 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i32.i32(i32 %a, i32 %b, i32 8, i1 0)
+  ret <2 x i1> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
new file mode 100644
index 00000000000000..4912bc0f59d40d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s | FileCheck %s
+
+define <vscale x 16 x i1> @whilewr_8(i64 %a, i64 %b) {
+; CHECK-LABEL: whilewr_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilewr p0.b, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 1, i1 1)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 8 x i1> @whilewr_16(i64 %a, i64 %b) {
+; CHECK-LABEL: whilewr_16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilewr p0.h, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 2, i1 1)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 4 x i1> @whilewr_32(i64 %a, i64 %b) {
+; CHECK-LABEL: whilewr_32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilewr p0.s, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 4 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 4, i1 1)
+  ret <vscale x 4 x i1> %0
+}
+
+define <vscale x 2 x i1> @whilewr_64(i64 %a, i64 %b) {
+; CHECK-LABEL: whilewr_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilewr p0.d, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 2 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 8, i1 1)
+  ret <vscale x 2 x i1> %0
+}
+
+define <vscale x 16 x i1> @whilerw_8(i64 %a, i64 %b) {
+; CHECK-LABEL: whilerw_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilerw p0.b, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 1, i1 0)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 8 x i1> @whilerw_16(i64 %a, i64 %b) {
+; CHECK-LABEL: whilerw_16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilerw p0.h, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 2, i1 0)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 4 x i1> @whilerw_32(i64 %a, i64 %b) {
+; CHECK-LABEL: whilerw_32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilerw p0.s, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 4 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 4, i1 0)
+  ret <vscale x 4 x i1> %0
+}
+
+define <vscale x 2 x i1> @whilerw_64(i64 %a, i64 %b) {
+; CHECK-LABEL: whilerw_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilerw p0.d, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 2 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %a, i64 %b, i64 8, i1 0)
+  ret <vscale x 2 x i1> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
deleted file mode 100644
index 9f1ea850792384..00000000000000
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ /dev/null
@@ -1,1086 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
-
-define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.b, x1, x2
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_8:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
-; CHECK-NOSVE2-NEXT:    cset w9, lt
-; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
-; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 16 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_commutative:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.b, x1, x2
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_commutative:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
-; CHECK-NOSVE2-NEXT:    cset w9, lt
-; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
-; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %.splat, %ptr.diff.lane.mask
-  ret <vscale x 16 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.h, x1, x2
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_16:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    cmn x8, #1
-; CHECK-NOSVE2-NEXT:    add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT:    cset w9, lt
-; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT:    asr x8, x8, #1
-; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x8
-; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %b14 = ptrtoint ptr %b to i64
-  %c15 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b14, %c15
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 8 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.s, x1, x2
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_32:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    add x9, x8, #3
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
-; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT:    cmn x8, #3
-; CHECK-NOSVE2-NEXT:    cset w8, lt
-; CHECK-NOSVE2-NEXT:    asr x9, x9, #2
-; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x8
-; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 4 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.d, x1, x2
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_64:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    add x9, x8, #7
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
-; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT:    cmn x8, #7
-; CHECK-NOSVE2-NEXT:    cset w8, lt
-; CHECK-NOSVE2-NEXT:    asr x9, x9, #3
-; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x8
-; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 2 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: no_whilewr_128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x2
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x9, x8, #15
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    csel x9, x9, x8, lt
-; CHECK-NEXT:    cmn x8, #15
-; CHECK-NEXT:    asr x9, x9, #4
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    mov z1.d, x9
-; CHECK-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_128:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT:    ptrue p0.d
-; CHECK-NOSVE2-NEXT:    add x9, x8, #15
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
-; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT:    cmn x8, #15
-; CHECK-NOSVE2-NEXT:    asr x9, x9, #4
-; CHECK-NOSVE2-NEXT:    cset w8, lt
-; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT:    mov z1.d, x9
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT:    punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT:    punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 16
-  %neg.compare = icmp slt i64 %sub.diff, -15
-  %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 1 x i1> %active.lane.mask.alias
-}
-
-define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB6_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.b, x1, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.b
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB6_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    add z0.b, z1.b, z0.b
-; CHECK-NEXT:    st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NEXT:    b.mi .LBB6_2
-; CHECK-NEXT:  .LBB6_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_8:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB6_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT:    sbfx x9, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB6_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT:    add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB6_2
-; CHECK-NOSVE2-NEXT:  .LBB6_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
-  %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
-  %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
-  %2 = zext i8 %1 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
-  %4 = getelementptr inbounds i8, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
-  %5 = getelementptr inbounds i8, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
-  %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
-  %7 = getelementptr inbounds i8, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
-  %index.next = add i64 %index, %2
-  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
-  br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB7_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    whilewr p1.h, x1, x2
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    whilelo p0.h, xzr, x8
-; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:  .LBB7_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2, x9, lsl #1]
-; CHECK-NEXT:    inch x9
-; CHECK-NEXT:    whilelo p0.h, x9, x8
-; CHECK-NEXT:    b.mi .LBB7_2
-; CHECK-NEXT:  .LBB7_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_16:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB7_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT:    cmn x10, #1
-; CHECK-NOSVE2-NEXT:    add x10, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT:    cset w11, lt
-; CHECK-NOSVE2-NEXT:    sbfx x11, x11, #0, #1
-; CHECK-NOSVE2-NEXT:    asr x10, x10, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x11
-; CHECK-NOSVE2-NEXT:    whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT:    cnth x10
-; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:  .LBB7_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p0.h, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB7_2
-; CHECK-NOSVE2-NEXT:  .LBB7_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %b14 = ptrtoint ptr %b to i64
-  %c15 = ptrtoint ptr %c to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 3
-  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b14, %c15
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i16, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
-  %4 = getelementptr inbounds i16, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
-  %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
-  %6 = getelementptr inbounds i16, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
-  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB8_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    whilewr p1.s, x1, x2
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    whilelo p0.s, xzr, x8
-; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:  .LBB8_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; CHECK-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2, x9, lsl #2]
-; CHECK-NEXT:    incw x9
-; CHECK-NEXT:    whilelo p0.s, x9, x8
-; CHECK-NEXT:    b.mi .LBB8_2
-; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_32:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB8_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT:    add x11, x10, #3
-; CHECK-NOSVE2-NEXT:    cmp x10, #0
-; CHECK-NOSVE2-NEXT:    csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT:    cmn x10, #3
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    asr x11, x11, #2
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p2.s, xzr, x11
-; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT:    cntw x10
-; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:  .LBB8_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p0.s, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB8_2
-; CHECK-NOSVE2-NEXT:  .LBB8_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 2
-  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i32, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
-  %4 = getelementptr inbounds i32, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
-  %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
-  %6 = getelementptr inbounds i32, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
-  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB9_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    whilewr p1.d, x1, x2
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    whilelo p0.d, xzr, x8
-; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:  .LBB9_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2, x9, lsl #3]
-; CHECK-NEXT:    incd x9
-; CHECK-NEXT:    whilelo p0.d, x9, x8
-; CHECK-NEXT:    b.mi .LBB9_2
-; CHECK-NEXT:  .LBB9_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_64:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB9_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT:    add x11, x10, #7
-; CHECK-NOSVE2-NEXT:    cmp x10, #0
-; CHECK-NOSVE2-NEXT:    csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT:    cmn x10, #7
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    asr x11, x11, #3
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p2.d, xzr, x11
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT:    cntd x10
-; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:  .LBB9_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p0.d, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB9_2
-; CHECK-NOSVE2-NEXT:  .LBB9_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 1
-  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i64, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
-  %4 = getelementptr inbounds i64, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
-  %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
-  %6 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
-  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB10_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.b, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.b, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.b
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB10_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    add z0.b, z1.b, z0.b
-; CHECK-NEXT:    st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NEXT:    b.mi .LBB10_2
-; CHECK-NEXT:  .LBB10_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB10_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x10
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    whilelo p3.b, xzr, x9
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    whilelo p2.b, xzr, x10
-; CHECK-NOSVE2-NEXT:    sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB10_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT:    add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB10_2
-; CHECK-NOSVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %a15 = ptrtoint ptr %a to i64
-  %b16 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff18 = sub i64 %b16, %c14
-  %neg.compare20 = icmp slt i64 %sub.diff18, 0
-  %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
-  %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
-  %active.lane.mask.alias24 = or <vscale x 16 x i1> %ptr.diff.lane.mask23, %.splat22
-  %0 = and <vscale x 16 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
-  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 16 x i1> %0 to <vscale x 16 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 16 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i8, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
-  %6 = getelementptr inbounds i8, ptr %b, i64 %index
-  %wide.masked.load25 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
-  %7 = add <vscale x 16 x i8> %wide.masked.load25, %wide.masked.load
-  %8 = getelementptr inbounds i8, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %7, ptr %8, i32 1, <vscale x 16 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB11_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.h, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.h, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.h, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.h
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB11_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.h, x8, x9
-; CHECK-NEXT:    b.mi .LBB11_2
-; CHECK-NEXT:  .LBB11_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB11_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    cmn x9, #1
-; CHECK-NOSVE2-NEXT:    add x9, x9, x9, lsr #63
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    asr x9, x9, #1
-; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x10
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
-; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
-; CHECK-NOSVE2-NEXT:    add x9, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT:    cmn x10, #1
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    asr x9, x9, #1
-; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p3.h, xzr, x9
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT:    sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
-; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.h
-; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB11_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p1.h, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB11_2
-; CHECK-NOSVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %a15 = ptrtoint ptr %a to i64
-  %b16 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a15, %c14
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff18 = sub i64 %b16, %c14
-  %diff19 = sdiv i64 %sub.diff18, 2
-  %neg.compare20 = icmp slt i64 %sub.diff18, -1
-  %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
-  %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
-  %active.lane.mask.alias24 = or <vscale x 8 x i1> %ptr.diff.lane.mask23, %.splat22
-  %0 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
-  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 8 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i16, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
-  %6 = getelementptr inbounds i16, ptr %b, i64 %index
-  %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
-  %7 = add <vscale x 8 x i16> %wide.masked.load25, %wide.masked.load
-  %8 = getelementptr inbounds i16, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB12_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.s, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.s, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.s, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.s
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB12_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.s, x8, x9
-; CHECK-NEXT:    b.mi .LBB12_2
-; CHECK-NEXT:  .LBB12_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB12_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    add x10, x9, #3
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #3
-; CHECK-NOSVE2-NEXT:    asr x9, x10, #2
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
-; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT:    add x10, x9, #3
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #3
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    cset w9, lt
-; CHECK-NOSVE2-NEXT:    asr x10, x10, #2
-; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p3.s, xzr, x10
-; CHECK-NOSVE2-NEXT:    whilelo p2.s, xzr, x9
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.s
-; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB12_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p1.s, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB12_2
-; CHECK-NOSVE2-NEXT:  .LBB12_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c12 = ptrtoint ptr %c to i64
-  %a13 = ptrtoint ptr %a to i64
-  %b14 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a13, %c12
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff16 = sub i64 %b14, %c12
-  %diff17 = sdiv i64 %sub.diff16, 4
-  %neg.compare18 = icmp slt i64 %sub.diff16, -3
-  %.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
-  %.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17)
-  %active.lane.mask.alias22 = or <vscale x 4 x i1> %ptr.diff.lane.mask21, %.splat20
-  %0 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
-  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 4 x i1> %0 to <vscale x 4 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 4 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i32, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
-  %6 = getelementptr inbounds i32, ptr %b, i64 %index
-  %wide.masked.load23 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
-  %7 = add <vscale x 4 x i32> %wide.masked.load23, %wide.masked.load
-  %8 = getelementptr inbounds i32, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %7, ptr %8, i32 4, <vscale x 4 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB13_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.d, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.d, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.d, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.d
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB13_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.d, x8, x9
-; CHECK-NEXT:    b.mi .LBB13_2
-; CHECK-NEXT:  .LBB13_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB13_3
-; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
-; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    add x10, x9, #7
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #7
-; CHECK-NOSVE2-NEXT:    asr x9, x10, #3
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT:    add x10, x9, #7
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
-; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #7
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    cset w9, lt
-; CHECK-NOSVE2-NEXT:    asr x10, x10, #3
-; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p3.d, xzr, x10
-; CHECK-NOSVE2-NEXT:    whilelo p2.d, xzr, x9
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.d
-; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB13_2: // %vector.body
-; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB13_2
-; CHECK-NOSVE2-NEXT:  .LBB13_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT:    ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c12 = ptrtoint ptr %c to i64
-  %a13 = ptrtoint ptr %a to i64
-  %b14 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a13, %c12
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff16 = sub i64 %b14, %c12
-  %diff17 = sdiv i64 %sub.diff16, 8
-  %neg.compare18 = icmp slt i64 %sub.diff16, -7
-  %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
-  %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)
-  %active.lane.mask.alias22 = or <vscale x 2 x i1> %ptr.diff.lane.mask21, %.splat20
-  %0 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
-  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 2 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i64, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
-  %6 = getelementptr inbounds i64, ptr %b, i64 %index
-  %wide.masked.load23 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
-  %7 = add <vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
-  %8 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-declare i64 @llvm.vscale.i64()
-
-declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
-
-declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
-
-declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
-
-declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
-
-declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>)
-
-declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>)
-
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
-
-declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
-
-declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>)
-
-declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
-
-declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>)
-
-declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>)

>From e984b6120792f90ef00cd9ac23fde16adee73a5a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 21 Nov 2024 13:53:11 +0000
Subject: [PATCH 02/12] Fix intrinsic signature in docs

---
 llvm/docs/LangRef.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index c9589d5af8ebbe..1ad4b5c3b7e860 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -23477,7 +23477,7 @@ Examples:
 
 .. _int_experimental_get_alias_lane_mask:
 
-'``llvm.get.alias.lane.mask.*``' Intrinsics
+'``llvm.experimental.get.alias.lane.mask.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -23486,10 +23486,10 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
-      declare <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
-      declare <16 x i1> @llvm.experimental.get.alias.lane.mask.v16i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
-      declare <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <16 x i1> @llvm.experimental.get.alias.lane.mask.v16i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
+      declare <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
 
 
 Overview:

>From 2ea522b993b1f835e519ed51ac6f167556fc3772 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 21 Nov 2024 14:57:06 +0000
Subject: [PATCH 03/12] Add -o - to test run lines

---
 llvm/docs/LangRef.rst                            | 4 ++--
 llvm/test/CodeGen/AArch64/alias_mask.ll          | 4 ++--
 llvm/test/CodeGen/AArch64/alias_mask_scalable.ll | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1ad4b5c3b7e860..71c1c22cbac026 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -23524,7 +23524,7 @@ Otherwise they are semantically equivalent to:
 
 where ``%m`` is a vector (mask) of active/inactive lanes with its elements
 indexed by ``i``,  and ``%ptrA``, ``%ptrB`` are the two i64 arguments to
-``llvm.experimental.get.alias.lane.mask.*``, ``%elementSize`` is the i32 argument, ``%abs`` is the absolute difference operation, ``%icmp`` is an integer compare and ``ult``
+``llvm.experimental.get.alias.lane.mask.*``, ``%elementSize`` is the first immediate argument, ``%abs`` is the absolute difference operation, ``%icmp`` is an integer compare and ``ult``
 the unsigned less-than comparison operator. The subtraction between ``%ptrA`` and ``%ptrB`` could be negative. The ``%writeAfterRead`` argument is expected to be true if the ``%ptrB`` is stored to after ``%ptrA`` is read from.
 The above is equivalent to:
 
@@ -23551,7 +23551,7 @@ Examples:
 
 .. code-block:: llvm
 
-      %alias.lane.mask = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 4, i1 1)
+      %alias.lane.mask = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 4, i1 1)
       %vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptrA, i32 4, <4 x i1> %alias.lane.mask, <4 x i32> poison)
       [...]
       call @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, <4 x i32>* %ptrB, i32 4, <4 x i1> %alias.lane.mask)
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index 59ad2bf82db92e..84a22822f1702b 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s | FileCheck %s --check-prefix=CHECK-SVE
-; RUN: llc -mtriple=aarch64 %s | FileCheck %s --check-prefix=CHECK-NOSVE
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
 
 define <16 x i1> @whilewr_8(i64 %a, i64 %b) {
 ; CHECK-SVE-LABEL: whilewr_8:
diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
index 4912bc0f59d40d..e4ef5292dee277 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s
 
 define <vscale x 16 x i1> @whilewr_8(i64 %a, i64 %b) {
 ; CHECK-LABEL: whilewr_8:

>From 4c4ab8aeed1b4fd66a90334dfcd889481f7e35b6 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 21 Nov 2024 17:01:47 +0000
Subject: [PATCH 04/12] Format

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  6 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 34 ++++++-----
 .../Target/AArch64/AArch64ISelLowering.cpp    | 58 +++++++++++--------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 +-
 4 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 0338310fd936df..51e716f50814e1 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -468,8 +468,10 @@ class TargetLoweringBase {
     return true;
   }
 
-  /// Return true if the @llvm.experimental.get.alias.lane.mask intrinsic should be expanded using generic code in SelectionDAGBuilder.
-  virtual bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT, unsigned EltSize) const {
+  /// Return true if the @llvm.experimental.get.alias.lane.mask intrinsic should
+  /// be expanded using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
+                                            unsigned EltSize) const {
     return true;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 39e84e06a8de60..f942ab05149c23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8288,17 +8288,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue SourceValue = getValue(I.getOperand(0));
     SDValue SinkValue = getValue(I.getOperand(1));
     SDValue EltSize = getValue(I.getOperand(2));
-    bool IsWriteAfterRead = cast<ConstantSDNode>(getValue(I.getOperand(3)))->getZExtValue() != 0;
+    bool IsWriteAfterRead =
+        cast<ConstantSDNode>(getValue(I.getOperand(3)))->getZExtValue() != 0;
     auto IntrinsicVT = EVT::getEVT(I.getType());
     auto PtrVT = SourceValue->getValueType(0);
 
-    if (!TLI.shouldExpandGetAliasLaneMask(IntrinsicVT, PtrVT, cast<ConstantSDNode>(EltSize)->getSExtValue())) {
+    if (!TLI.shouldExpandGetAliasLaneMask(
+            IntrinsicVT, PtrVT,
+            cast<ConstantSDNode>(EltSize)->getSExtValue())) {
       visitTargetIntrinsic(I, Intrinsic);
       return;
     }
 
-    SDValue Diff = DAG.getNode(ISD::SUB, sdl,
-                              PtrVT, SinkValue, SourceValue);
+    SDValue Diff = DAG.getNode(ISD::SUB, sdl, PtrVT, SinkValue, SourceValue);
     if (!IsWriteAfterRead)
       Diff = DAG.getNode(ISD::ABS, sdl, PtrVT, Diff);
 
@@ -8306,9 +8308,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Zero = DAG.getTargetConstant(0, sdl, PtrVT);
 
     // If the difference is positive then some elements may alias
-    auto CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
-                                   PtrVT);
-    SDValue Cmp = DAG.getSetCC(sdl, CmpVT, Diff, Zero, IsWriteAfterRead ? ISD::SETLE : ISD::SETEQ);
+    auto CmpVT =
+        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), PtrVT);
+    SDValue Cmp = DAG.getSetCC(sdl, CmpVT, Diff, Zero,
+                               IsWriteAfterRead ? ISD::SETLE : ISD::SETEQ);
 
     // Splat the compare result then OR it with a lane mask
     SDValue Splat = DAG.getSplat(IntrinsicVT, sdl, Cmp);
@@ -8316,14 +8319,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue DiffMask;
     // Don't emit an active lane mask if the target doesn't support it
     if (TLI.shouldExpandGetActiveLaneMask(IntrinsicVT, PtrVT)) {
-        EVT VecTy = EVT::getVectorVT(*DAG.getContext(), PtrVT,
-                                     IntrinsicVT.getVectorElementCount());
-        SDValue DiffSplat = DAG.getSplat(VecTy, sdl, Diff);
-        SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
-        DiffMask = DAG.getSetCC(sdl, IntrinsicVT, VectorStep,
-                                     DiffSplat, ISD::CondCode::SETULT);
+      EVT VecTy = EVT::getVectorVT(*DAG.getContext(), PtrVT,
+                                   IntrinsicVT.getVectorElementCount());
+      SDValue DiffSplat = DAG.getSplat(VecTy, sdl, Diff);
+      SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
+      DiffMask = DAG.getSetCC(sdl, IntrinsicVT, VectorStep, DiffSplat,
+                              ISD::CondCode::SETULT);
     } else {
-      DiffMask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, IntrinsicVT, DAG.getTargetConstant(Intrinsic::get_active_lane_mask, sdl, MVT::i64), Zero, Diff);
+      DiffMask = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, sdl, IntrinsicVT,
+          DAG.getTargetConstant(Intrinsic::get_active_lane_mask, sdl, MVT::i64),
+          Zero, Diff);
     }
     SDValue Or = DAG.getNode(ISD::OR, sdl, IntrinsicVT, DiffMask, Splat);
     setValue(&I, Or);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 66eaec0d5ae6c9..020635400eedae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2033,7 +2033,8 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
   return false;
 }
 
-bool AArch64TargetLowering::shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT, unsigned EltSize) const {
+bool AArch64TargetLowering::shouldExpandGetAliasLaneMask(
+    EVT VT, EVT PtrVT, unsigned EltSize) const {
   if (!Subtarget->hasSVE2())
     return true;
 
@@ -2042,7 +2043,7 @@ bool AArch64TargetLowering::shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT, unsi
 
   if (VT == MVT::v2i1 || VT == MVT::nxv2i1)
     return EltSize != 8;
-  if( VT == MVT::v4i1 || VT == MVT::nxv4i1)
+  if (VT == MVT::v4i1 || VT == MVT::nxv4i1)
     return EltSize != 4;
   if (VT == MVT::v8i1 || VT == MVT::nxv8i1)
     return EltSize != 2;
@@ -5905,12 +5906,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_sve_whilewr_h:
   case Intrinsic::aarch64_sve_whilewr_s:
   case Intrinsic::aarch64_sve_whilewr_d:
-    return DAG.getNode(AArch64ISD::WHILEWR, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2));
+    return DAG.getNode(AArch64ISD::WHILEWR, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::aarch64_sve_whilerw_b:
   case Intrinsic::aarch64_sve_whilerw_h:
   case Intrinsic::aarch64_sve_whilerw_s:
   case Intrinsic::aarch64_sve_whilerw_d:
-    return DAG.getNode(AArch64ISD::WHILERW, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2));
+    return DAG.getNode(AArch64ISD::WHILERW, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::aarch64_neon_abs: {
     EVT Ty = Op.getValueType();
     if (Ty == MVT::i64) {
@@ -6377,34 +6380,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       uint64_t EltSize = Op.getOperand(3)->getAsZExtVal();
       bool IsWriteAfterRead = Op.getOperand(4)->getAsZExtVal() == 1;
       switch (EltSize) {
-        case 1:
-          IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b : Intrinsic::aarch64_sve_whilerw_b;
-          break;
-        case 2:
-          IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h : Intrinsic::aarch64_sve_whilerw_h;
-          break;
-        case 4:
-          IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s : Intrinsic::aarch64_sve_whilerw_s;
-          break;
-        case 8:
-          IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d : Intrinsic::aarch64_sve_whilerw_d;
-          break;
-        default:
-          llvm_unreachable("Unexpected element size for get.alias.lane.mask");
-          break;
+      case 1:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b
+                                       : Intrinsic::aarch64_sve_whilerw_b;
+        break;
+      case 2:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h
+                                       : Intrinsic::aarch64_sve_whilerw_h;
+        break;
+      case 4:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s
+                                       : Intrinsic::aarch64_sve_whilerw_s;
+        break;
+      case 8:
+        IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d
+                                       : Intrinsic::aarch64_sve_whilerw_d;
+        break;
+      default:
+        llvm_unreachable("Unexpected element size for get.alias.lane.mask");
+        break;
       }
     }
-    SDValue ID =
-        DAG.getTargetConstant(IntrinsicID, dl, MVT::i64);
+    SDValue ID = DAG.getTargetConstant(IntrinsicID, dl, MVT::i64);
 
     EVT VT = Op.getValueType();
     if (VT.isScalableVector())
       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
                          Op.getOperand(2));
 
-    // We can use the SVE whilelo/whilewr/whilerw instruction to lower this intrinsic by
-    // creating the appropriate sequence of scalable vector operations and
-    // then extracting a fixed-width subvector from the scalable vector.
+    // We can use the SVE whilelo/whilewr/whilerw instruction to lower this
+    // intrinsic by creating the appropriate sequence of scalable vector
+    // operations and then extracting a fixed-width subvector from the scalable
+    // vector.
 
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
     EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
@@ -19544,7 +19551,8 @@ static bool isPredicateCCSettingOp(SDValue N) {
         // get_active_lane_mask is lowered to a whilelo instruction.
         N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask ||
         // get_alias_lane_mask is lowered to a whilewr/rw instruction.
-        N.getConstantOperandVal(0) == Intrinsic::experimental_get_alias_lane_mask)))
+        N.getConstantOperandVal(0) ==
+            Intrinsic::experimental_get_alias_lane_mask)))
     return true;
 
   return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index b2f766b22911ff..41f56349215903 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -984,7 +984,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
 
-  bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT, unsigned EltSize) const override;
+  bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
+                                    unsigned EltSize) const override;
 
   bool
   shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;

>From 99b1f033cfeb68e2f7cd738bf3c14bbedab7d9ff Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 18 Oct 2024 15:49:22 +0100
Subject: [PATCH 05/12] [LV] Mask off possibly aliasing vector lanes

    When vectorising a loop that loads from one address and stores to
    another, the two pointers can overlap if their difference is less than
    the vector factor. For example, if address 20 is being stored to and
    address 23 is being loaded from, they overlap when the vector factor is
    4 or higher. Currently LoopVectorize branches to a scalar loop in these
    cases with a runtime check. However, if we construct a mask that
    disables the overlapping (aliasing) lanes then the vectorised loop can
    be safely entered, as long as the loads and stores are masked off.
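
    To make the hazard concrete, here is a minimal C++ sketch of the kind of
    loop this targets (modelled on the new alias_mask.ll test; the function
    and parameter names are invented for illustration, not taken from the
    patch):

        #include <cstddef>
        #include <cstdint>

        // b and c are not known to be disjoint. If they are closer together
        // than the vector factor, some lanes of a vectorised iteration could
        // read and write the same bytes in the wrong order. Rather than
        // always bailing out to the scalar loop, the patch builds a per-lane
        // alias mask in the vector preheader, ANDs it into the loop's lane
        // mask so unsafe lanes are masked off, and advances the induction
        // variable by the mask's popcount instead of by VF.
        void add_bytes(const uint8_t *a, const uint8_t *b, uint8_t *c,
                       size_t n) {
          for (size_t i = 0; i < n; i++)
            c[i] = a[i] + b[i];
        }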
---
 .../llvm/Analysis/TargetTransformInfo.h       |   7 ++
 .../Vectorize/LoopVectorizationPlanner.h      |  22 +++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  89 +++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   1 -
 llvm/lib/Transforms/Vectorize/VPlan.h         |  48 +++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  64 +++++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  68 ++++++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 .../LoopVectorize/AArch64/alias_mask.ll       | 117 ++++++++++++++++++
 .../AArch64/induction-costs-sve.ll            |  25 +++-
 .../runtime-check-small-clamped-bounds.ll     |   4 +-
 .../runtime-checks-difference.ll              |  62 +++++-----
 13 files changed, 434 insertions(+), 80 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index e37bce3118bcb2..ffc9c856a24434 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -195,6 +195,13 @@ enum class TailFoldingStyle {
   DataWithEVL,
 };
 
+enum class RTCheckStyle {
+  /// Branch to the scalar loop if the checks fail at runtime.
+  ScalarFallback,
+  /// Form a mask based on elements which won't be a WAR or RAW hazard.
+  UseSafeEltsMask,
+};
+
 struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a6b5235235ff3b..81f06fcff0cc74 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -418,7 +418,13 @@ class LoopVectorizationPlanner {
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
-  void plan(ElementCount UserVF, unsigned UserIC);
+  /// \p DiffChecks is a list of pointer pairs that should be checked for
+  /// aliasing. \p HasAliasMask is set to true if an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across
+  /// a loop iteration.
+  void plan(ElementCount UserVF, unsigned UserIC,
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+            bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -495,12 +501,22 @@ class LoopVectorizationPlanner {
   /// returned VPlan is valid for. If no VPlan can be built for the input range,
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built.
-  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
+  /// \p RTChecks is a list of pointer pairs that should be checked for
+  /// aliasing. \p HasAliasMask is set to true if an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across
+  /// a loop iteration.
+  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range,
+                                        ArrayRef<PointerDiffInfo> RTChecks,
+                                        bool &HasAliasMask);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+  /// RTChecks contains a list of pointer pairs that an alias mask should be
+  /// generated for.
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+                                ArrayRef<PointerDiffInfo> RTChecks,
+                                bool &HasAliasMask);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fda6550a375480..70929039281ecb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -174,6 +174,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -1811,6 +1812,10 @@ class GeneratedRTChecks {
   PredicatedScalarEvolution &PSE;
 
 public:
+  /// Set by VPlan when the vector loop should be entered even when runtime
+  /// checks determine that pointers alias within an iteration.
+  bool HasAliasMask = false;
+
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
                     const DataLayout &DL, bool AddBranchWeights)
@@ -1852,9 +1857,11 @@ class GeneratedRTChecks {
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
     if (RtPtrChecking.Need) {
-      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
-      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
-                                 "vector.memcheck");
+      if (!MemCheckBlock) {
+        auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
+        MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
+                                   "vector.memcheck");
+      }
 
       auto DiffChecks = RtPtrChecking.getDiffChecks();
       if (DiffChecks) {
@@ -2081,11 +2088,18 @@ class GeneratedRTChecks {
     if (OuterLoop)
       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
 
-    BranchInst &BI =
-        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
-    if (AddBranchWeights) {
+    // TODO: Branch to the vector preheader conditionally based on the number of
+    // non-aliasing elements. The scalar loop will likely be better if only one
+    // or two elements will be processed per vectorised loop iteration.
+
+    // Jump to the vector preheader unconditionally if it's safe to do so
+    // because an alias mask has been set up.
+    BranchInst &BI = HasAliasMask
+                         ? *BranchInst::Create(LoopVectorPreHeader)
+                         : *BranchInst::Create(Bypass, LoopVectorPreHeader,
+                                               MemRuntimeCheckCond);
+    if (!HasAliasMask && AddBranchWeights)
       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
-    }
     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
     MemCheckBlock->getTerminator()->setDebugLoc(
         Pred->getTerminator()->getDebugLoc());
@@ -2558,7 +2572,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-  LoopBypassBlocks.push_back(MemCheckBlock);
+  // If an alias mask has been set up then we don't need the bypass, as the
+  // vector preheader will be branched to unconditionally.
+  if (!RTChecks.HasAliasMask)
+    LoopBypassBlocks.push_back(MemCheckBlock);
 
   AddedSafetyChecks = true;
 
@@ -7125,7 +7142,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+void LoopVectorizationPlanner::plan(
+    ElementCount UserVF, unsigned UserIC,
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -7134,6 +7153,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
+  ArrayRef<PointerDiffInfo> DiffChecks;
+  if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+    DiffChecks = *RTChecks;
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -7166,7 +7189,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       }
@@ -7195,8 +7218,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInstsToScalarize(VF);
   }
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
-  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+                           DiffChecks, HasAliasMask);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+                           DiffChecks, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -7743,7 +7768,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
                              CanonicalIVStartValue, State);
 
   BestVPlan.execute(&State);
-
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB = BestVPlan.getMiddleBlock();
   if (VectorizingEpilogue)
@@ -7975,7 +7999,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
   // reduction phis in the scalar loop preheader.
   if (EPI.SCEVSafetyCheck)
     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
-  if (EPI.MemSafetyCheck)
+  if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
 
@@ -8231,9 +8255,8 @@ void VPRecipeBuilder::createHeaderMask() {
 
   VPBuilder::InsertPointGuard Guard(Builder);
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  VPValue *BlockMask = nullptr;
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
   BlockMaskCache[Header] = BlockMask;
 }
 
@@ -8778,14 +8801,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
-                                                        ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+    ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+    bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+    if (auto Plan =
+            tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
       // Now optimize the initial VPlan.
       if (!Plan->hasVF(ElementCount::getFixed(1)))
         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
@@ -8818,6 +8843,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
   VPBuilder Builder(TopRegion->getExitingBasicBlock());
   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+  // If an alias mask is present, this increment is later replaced by an
+  // increment by the mask's popcount.
   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
       "index.next");
@@ -9037,8 +9064,8 @@ static void addExitUsersForFirstOrderRecurrences(
   }
 }
 
-VPlanPtr
-LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+    VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
@@ -9279,7 +9306,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
-                                       WithoutRuntimeCheck);
+                                       WithoutRuntimeCheck, PSE, RTChecks);
+    if (ForControlFlow && !RTChecks.empty())
+      HasAliasMask = true;
   }
   return Plan;
 }
@@ -9762,6 +9791,7 @@ static bool processLoopInVPlanNativePath(
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
   return true;
 }
 
@@ -10087,18 +10117,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  bool AddBranchWeights =
+      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+                           AddBranchWeights);
+
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC);
+  LVP.plan(UserVF, UserIC,
+           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+           Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
+  if (Checks.HasAliasMask)
+    LoopsAliasMasked++;
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  bool AddBranchWeights =
-      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8b1a4aeb88f81f..b8988ded3a617f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -952,7 +952,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index abfe97b4ab55b6..cfd829654eda38 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -853,6 +853,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
     case VPRecipeBase::VPEVLBasedIVPHISC:
+    case VPRecipeBase::VPAliasLaneMaskSC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
@@ -1226,6 +1227,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // last. The second operand must be a positive constant and <= VF.
     ExtractFromEnd,
     LogicalAnd, // Non-poison propagating logical And.
+    PopCount,
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
@@ -3073,6 +3075,52 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   }
 };
 
+// Given a pointer A that is being stored to, and pointer B that is being
+// read from, both with unknown lengths, create a mask that disables
+// elements which could overlap across a loop iteration. For example, if A
+// is X and B is X + 2 with VF being 4, only the final two elements of the
+// loaded vector can be stored since they don't overlap with the stored vector:
+//   %b.vec = load %b   ; = [s, t, u, v]
+//   [...]
+//   store %a, %b.vec   ; only u and v can be stored as their addresses
+//                      ; don't overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize)
+      : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
+        ElementSize(ElementSize) {}
+
+  ~VPAliasLaneMaskRecipe() override = default;
+
+  VPAliasLaneMaskRecipe *clone() override {
+    return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
+                                     ElementSize);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+  void execute(VPTransformState &State) override;
+
+  /// Get the VPValue* for the pointer being read from.
+  VPValue *getSourceValue() const { return getOperand(0); }
+
+  /// Get the size of the element(s) accessed by the pointers.
+  unsigned getAccessedElementSize() const { return ElementSize; }
+
+  /// Get the VPValue* for the pointer being stored to.
+  VPValue *getSinkValue() const { return getOperand(1); }
+
+private:
+  unsigned ElementSize;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ef2ca9af7268d1..7cc5db1a563c73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -361,6 +361,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::PopCount:
     return true;
   default:
     return false;
@@ -427,6 +428,29 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  // Count the bits set in each lane and add-reduce the result to a scalar.
+  case VPInstruction::PopCount: {
+    Value *Op = State.get(getOperand(0));
+    auto *VT = Op->getType();
+    Value *Cnt = Op;
+
+    // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+    // first.
+    if (VT->getScalarSizeInBits() > 1)
+      Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+    auto *VecVT = cast<VectorType>(VT);
+    // Extend to i8 so the add reduction doesn't overflow narrow elements.
+    if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+      Cnt = Builder.CreateCast(
+          Instruction::ZExt, Cnt,
+          VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+    }
+
+    Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+    Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+    return Cnt;
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -644,7 +668,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == PopCount;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -772,6 +797,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ResumePhi:
     O << "resume-phi";
     break;
+  case VPInstruction::PopCount:
+    O << "popcount";
+    break;
   case VPInstruction::ExplicitVectorLength:
     O << "EXPLICIT-VECTOR-LENGTH";
     break;
@@ -3235,6 +3263,40 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+  IRBuilderBase Builder = State.Builder;
+  Value *SinkValue = State.get(getSinkValue(), true);
+  Value *SourceValue = State.get(getSourceValue(), true);
+
+  Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
+  auto *Type = Diff->getType();
+  Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+  Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+  // If the difference is negative then some elements may alias
+  Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLE, DiffDiv,
+                                  ConstantInt::get(Type, 0), "neg.compare");
+  // Splat the compare result then OR it with a lane mask
+  Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+  Value *DiffMask = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask,
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+      {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+  Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
+  State.set(this, Or, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  getVPSingleValue()->printAsOperand(O, SlotTracker);
+  O << " = alias lane mask ";
+  getSourceValue()->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getSinkValue()->printAsOperand(O, SlotTracker);
+}
+#endif
+
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "cannot be used in per-lane");
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
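
For reference, the mask that VPAliasLaneMaskRecipe::execute emits can be modelled element-wise as follows. This is only a scalar C++ sketch of the generated IR for the write-after-read case (sub, sdiv, signed compare against zero, then the splat ORed with the active lane mask); the function and parameter names are invented and it is not code from the patch:

    #include <cstdint>
    #include <vector>

    // Lane I is enabled if the element distance between the read and write
    // pointers is non-positive, or if I is below that distance.
    std::vector<bool> aliasLaneMask(int64_t ReadPtr, int64_t WritePtr,
                                    int64_t EltSize, unsigned VF) {
      int64_t Diff = (ReadPtr - WritePtr) / EltSize; // sub.diff, then sdiv
      std::vector<bool> Mask(VF);
      for (unsigned I = 0; I < VF; ++I)
        Mask[I] = Diff <= 0 || static_cast<int64_t>(I) < Diff;
      return Mask;
    }
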
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c1b9d6ede51090..3787cfba66e520 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1289,8 +1289,9 @@ void VPlanTransforms::optimize(VPlan &Plan) {
 //   %Negated = Not %ALM
 //   branch-on-cond %Negated
 //
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
-    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
+    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck,
+    PredicatedScalarEvolution &PSE, ArrayRef<PointerDiffInfo> RTChecks) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1300,14 +1301,38 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   // TODO: Check if dropping the flags is needed if
   // !DataAndControlFlowWithoutRuntimeCheck.
+  VPValue *IncVal = CanonicalIVIncrement->getOperand(1);
+  assert(IncVal != CanonicalIVPHI && "Unexpected operand order");
+
   CanonicalIVIncrement->dropPoisonGeneratingFlags();
   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+
   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
   // we have to take unrolling into account. Each part needs to start at
   //   Part * VF
   auto *VecPreheader = Plan.getVectorPreheader();
   VPBuilder Builder(VecPreheader);
 
+  // Create an alias mask for each possibly-aliasing pointer pair. If there
+  // are multiple, they are combined with ANDs.
+  VPValue *AliasMask = nullptr;
+
+  for (auto C : RTChecks) {
+    // FIXME: How to pass this info back?
+    //    HasAliasMask = true;
+    VPValue *Sink =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SinkStart, *PSE.getSE());
+    VPValue *Src =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
+    VPAliasLaneMaskRecipe *M =
+        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize);
+    VecPreheader->appendRecipe(M);
+    if (AliasMask)
+      AliasMask = Builder.createAnd(AliasMask, M);
+    else
+      AliasMask = M;
+  }
+
   // Create the ActiveLaneMask instruction using the correct start values.
   VPValue *TC = Plan.getTripCount();
 
@@ -1331,14 +1356,37 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
+  VPValue *Mask =
       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                            DL, "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
-  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
+  VPValue *LaneMask = LaneMaskPhi;
+  if (AliasMask) {
+    // Increment phi by correct amount.
+    Builder.setInsertPoint(CanonicalIVIncrement);
+
+    VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
+                                                {AliasMask}, DL, "popcount");
+    Type *IVType = CanonicalIVPHI->getScalarType();
+
+    if (IVType->getScalarSizeInBits() < 64) {
+      auto *Cast =
+          new VPScalarCastRecipe(Instruction::Trunc, IncrementBy, IVType);
+      Cast->insertAfter(IncrementBy->getDefiningRecipe());
+      IncrementBy = Cast;
+    }
+    CanonicalIVIncrement->setOperand(1, IncrementBy);
+
+    // AND in the alias mask so only non-aliasing lanes are processed.
+    Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+                           CanonicalIVPHI->getParent()->getFirstNonPhi());
+    LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+                                    {LaneMaskPhi, AliasMask}, DL);
+  }
 
   // Create the active lane mask for the next iteration of the loop before the
   // original terminator.
@@ -1357,7 +1405,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   auto *NotMask = Builder.createNot(ALM, DL);
   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
   OriginalTerminator->eraseFromParent();
-  return LaneMaskPhi;
+  return LaneMask;
 }
 
 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
@@ -1407,7 +1455,9 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
 
 void VPlanTransforms::addActiveLaneMask(
     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
-    bool DataAndControlFlowWithoutRuntimeCheck) {
+    bool DataAndControlFlowWithoutRuntimeCheck, PredicatedScalarEvolution &PSE,
+    ArrayRef<PointerDiffInfo> RTChecks) {
+
   assert((!DataAndControlFlowWithoutRuntimeCheck ||
           UseActiveLaneMaskForControlFlow) &&
          "DataAndControlFlowWithoutRuntimeCheck implies "
@@ -1416,14 +1466,14 @@ void VPlanTransforms::addActiveLaneMask(
   auto *FoundWidenCanonicalIVUser =
       find_if(Plan.getCanonicalIV()->users(),
               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  assert(FoundWidenCanonicalIVUser &&
+  assert(FoundWidenCanonicalIVUser && *FoundWidenCanonicalIVUser &&
          "Must have widened canonical IV when tail folding!");
   auto *WideCanonicalIV =
       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  VPSingleDefRecipe *LaneMask;
+  VPValue *LaneMask;
   if (UseActiveLaneMaskForControlFlow) {
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
-        Plan, DataAndControlFlowWithoutRuntimeCheck);
+        Plan, DataAndControlFlowWithoutRuntimeCheck, PSE, RTChecks);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
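
Putting the pieces of addVPLaneMaskPhiAndUpdateExitBranch together, the resulting loop control can be described with the following scalar model. It assumes a single pointer pair and no interleaving, the names are invented, and it is a sketch of my reading of the transform rather than code from the patch:

    #include <cstdint>
    #include <vector>

    // AliasMask comes from the element-wise model above and is loop-invariant;
    // by construction it has at least one active lane (lanes [0, diff) when
    // the distance is positive, all lanes otherwise). Each iteration processes
    // the lanes where the active lane mask and the alias mask are both set,
    // then steps the induction variable by the alias mask's popcount rather
    // than by VF.
    void vectorLoopModel(const std::vector<bool> &AliasMask,
                         uint64_t TripCount, unsigned VF) {
      uint64_t Step = 0;
      for (unsigned I = 0; I < VF; ++I)
        Step += AliasMask[I];
      for (uint64_t Index = 0; Index < TripCount; Index += Step) {
        for (unsigned I = 0; I < VF; ++I) {
          if (Index + I < TripCount && AliasMask[I]) {
            // ... masked load, compute and masked store for element Index + I.
          }
        }
      }
    }
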
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 11e094db6294f6..30f83bfea97950 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -77,9 +77,13 @@ struct VPlanTransforms {
   /// creation) and instead it is handled using active-lane-mask. \p
   /// DataAndControlFlowWithoutRuntimeCheck implies \p
   /// UseActiveLaneMaskForControlFlow.
+  /// \p RTChecks lists the pointer pairs whose aliasing lanes need to be
+  /// masked off in each loop iteration.
   static void addActiveLaneMask(VPlan &Plan,
                                 bool UseActiveLaneMaskForControlFlow,
-                                bool DataAndControlFlowWithoutRuntimeCheck);
+                                bool DataAndControlFlowWithoutRuntimeCheck,
+                                PredicatedScalarEvolution &PSE,
+                                ArrayRef<PointerDiffInfo> RTChecks);
 
   /// Insert truncates and extends for any truncated recipe. Redundant casts
   /// will be folded later.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 691b0d40823cfb..b6b768f101304e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -323,6 +323,7 @@ class VPDef {
   using VPRecipeTy = enum {
     VPBranchOnMaskSC,
     VPDerivedIVSC,
+    VPAliasLaneMaskSC,
     VPExpandSCEVSC,
     VPIRInstructionSC,
     VPInstructionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 00000000000000..c4aafa97a334ca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP5]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B4]], [[C3]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <vscale x 16 x i1> [[TMP8]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP23]])
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i8 [[TMP24]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP29]], [[TMP28]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 2f756ab4b0e1ab..071849f995a546 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -139,14 +139,16 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; PRED:       vector.memcheck:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
-; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -156,6 +158,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
+; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
+; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -170,9 +179,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
 ; PRED-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
 ; PRED-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
 ; PRED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -181,8 +191,11 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
+; PRED-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; PRED-NEXT:    [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
+; PRED-NEXT:    [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
@@ -190,7 +203,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index bb515cd583e5bc..0288fccb519c10 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -195,8 +195,8 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP16]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index ecdc4ed416d47e..10e43dd97200b2 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -77,11 +77,11 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    [[N_SHL_1:%.]] = shl i64 %n, 1
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr %a, i64 [[N_SHL_1]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 %n, 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr %a, i64 [[TMP1]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP1]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
@@ -177,21 +177,22 @@ exit:
 define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
 ; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 %n, 2
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
 ; CHECK-NEXT:    br label %outer
 
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
@@ -226,22 +227,23 @@ exit:
 ; sink and source swapped.
 define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner2(
-; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    br label %outer
-
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
@@ -280,7 +282,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
 ; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
 ; CHECK:       outer.loop:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -288,7 +290,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:

>From 9564ee6e3bc469c492c4cc8e75ed1d45fd367258 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 22 Oct 2024 17:36:20 +0100
Subject: [PATCH 06/12] Don't lower a read-after-write hazard

---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h   |  5 +++--
 llvm/lib/Analysis/LoopAccessAnalysis.cpp          |  5 ++++-
 llvm/lib/Transforms/Utils/LoopUtils.cpp           |  3 ++-
 llvm/lib/Transforms/Vectorize/VPlan.h             | 10 +++++++---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp    |  6 ++++++
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp |  2 +-
 6 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index a35bc7402d1a89..66796c9a0db90f 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -435,11 +435,12 @@ struct PointerDiffInfo {
   const SCEV *SinkStart;
   unsigned AccessSize;
   bool NeedsFreeze;
+  bool WriteAfterRead;
 
   PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
-                  unsigned AccessSize, bool NeedsFreeze)
+                  unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
       : SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
-        NeedsFreeze(NeedsFreeze) {}
+        NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
 };
 
 /// Holds information about the memory runtime legality checks to verify
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 907bb7875dc807..0a7815eb498788 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -367,11 +367,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
     }
   }
 
+  bool WriteAfterRead = isa<LoadInst>(SrcInsts[0]);
+
   LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                     << "SrcStart: " << *SrcStartInt << '\n'
                     << "SinkStartInt: " << *SinkStartInt << '\n');
   DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
-                          Src->NeedsFreeze || Sink->NeedsFreeze);
+                          Src->NeedsFreeze || Sink->NeedsFreeze,
+                          WriteAfterRead);
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 70047273c3b9af..9fcbadec084d40 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2004,7 +2004,8 @@ Value *llvm::addDiffRuntimeChecks(
   // Map to keep track of created compares, The key is the pair of operands for
   // the compare, to allow detecting and re-using redundant compares.
   DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
-  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
+  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
+                    WriteAfterRead] : Checks) {
     Type *Ty = SinkStart->getType();
     // Compute VF * IC * AccessSize.
     auto *VFTimesUFTimesSize =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index cfd829654eda38..6749ed4cf38ba4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3087,15 +3087,16 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
 class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
 
 public:
-  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize)
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize,
+                        bool WriteAfterRead)
       : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
-        ElementSize(ElementSize) {}
+        ElementSize(ElementSize), WriteAfterRead(WriteAfterRead) {}
 
   ~VPAliasLaneMaskRecipe() override = default;
 
   VPAliasLaneMaskRecipe *clone() override {
     return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
-                                     ElementSize);
+                                     ElementSize, WriteAfterRead);
   }
 
   VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
@@ -3111,8 +3112,11 @@ class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
   /// Get the VPValue* for the pointer being stored to
   VPValue *getSinkValue() const { return getOperand(1); }
 
+  bool isWriteAfterRead() const { return WriteAfterRead; }
+
 private:
   unsigned ElementSize;
+  bool WriteAfterRead;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7cc5db1a563c73..0cce277d4b844e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3270,6 +3270,10 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
 
   Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
   auto *Type = Diff->getType();
+  if (!WriteAfterRead)
+    Diff = Builder.CreateIntrinsic(
+        Intrinsic::abs, {Type},
+        {Diff, ConstantInt::getFalse(Builder.getInt1Ty())});
   Value *MemEltSize = ConstantInt::get(Type, ElementSize);
   Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
   // If the difference is negative then some elements may alias
@@ -3294,6 +3298,8 @@ void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
   getSourceValue()->printAsOperand(O, SlotTracker);
   O << ", ";
   getSinkValue()->printAsOperand(O, SlotTracker);
+  O << " (" << (WriteAfterRead ? "write-after-read" : "read-after-write")
+    << ")";
 }
 #endif
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3787cfba66e520..7cd193bae5abdd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1325,7 +1325,7 @@ static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
     VPValue *Src =
         vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
     VPAliasLaneMaskRecipe *M =
-        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize);
+        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize, C.WriteAfterRead);
     VecPreheader->appendRecipe(M);
     if (AliasMask)
       AliasMask = Builder.createAnd(AliasMask, M);

>From 9f2fcfcc226b33bb51a49c1a8d71be4620efadef Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 25 Oct 2024 15:58:12 +0100
Subject: [PATCH 07/12] Add getRTCheckStyle and useSafeEltsMask

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 70929039281ecb..b01965ad6e3775 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1396,6 +1396,21 @@ class LoopVectorizationCostModel {
                                : ChosenTailFoldingStyle->second;
   }
 
+  RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle) const {
+    switch (TFStyle) {
+    case TailFoldingStyle::Data:
+    case TailFoldingStyle::DataAndControlFlow:
+    case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
+      return RTCheckStyle::UseSafeEltsMask;
+    default:
+      return RTCheckStyle::ScalarFallback;
+    }
+  }
+
+  RTCheckStyle getRTCheckStyle() const {
+    return getRTCheckStyle(getTailFoldingStyle());
+  }
+
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
   /// \param IsScalableVF true if scalable vector factors enabled.
@@ -2122,6 +2137,10 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
+static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style) {
+  return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -7154,7 +7173,9 @@ void LoopVectorizationPlanner::plan(
     return;
 
   ArrayRef<PointerDiffInfo> DiffChecks;
-  if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+  auto TFStyle = CM.getTailFoldingStyle();
+  if (RTChecks.has_value() &&
+      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle)))
     DiffChecks = *RTChecks;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.

>From df78d685541d1da057f9b7df34375d6c350550ab Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 6 Nov 2024 13:24:45 +0000
Subject: [PATCH 08/12] Compare with zero instead

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++++++++++--------
 .../LoopVectorize/AArch64/alias_mask.ll       |  2 +-
 .../AArch64/induction-costs-sve.ll            |  2 +-
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0cce277d4b844e..664e5820c5dad4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3270,21 +3270,24 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
 
   Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
   auto *Type = Diff->getType();
+  Value *Zero = ConstantInt::get(Type, 0);
   if (!WriteAfterRead)
     Diff = Builder.CreateIntrinsic(
         Intrinsic::abs, {Type},
-        {Diff, ConstantInt::getFalse(Builder.getInt1Ty())});
-  Value *MemEltSize = ConstantInt::get(Type, ElementSize);
-  Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
-  // If the difference is negative then some elements may alias
-  Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLE, DiffDiv,
-                                  ConstantInt::get(Type, 0), "neg.compare");
+        {Diff, ConstantInt::getFalse(Builder.getInt1Ty())}, nullptr, "sub.abs");
+
+  Value *DiffDiv = Builder.CreateSDiv(Diff, Zero, "diff");
+  // If the difference is positive then some elements may alias
+  auto CmpCode = WriteAfterRead ? CmpInst::Predicate::ICMP_SLE
+                                : CmpInst::Predicate::ICMP_EQ;
+  Value *Cmp = Builder.CreateICmp(CmpCode, DiffDiv, Zero, "neg.compare");
+
   // Splat the compare result then OR it with a lane mask
   Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
   Value *DiffMask = Builder.CreateIntrinsic(
       Intrinsic::get_active_lane_mask,
-      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
-      {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type}, {Zero, DiffDiv},
+      nullptr, "ptr.diff.lane.mask");
   Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
   State.set(this, Or, /*IsScalar=*/false);
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index c4aafa97a334ca..729c159063ba0b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -30,7 +30,7 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
 ; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B4]], [[C3]]
-; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 0
 ; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 071849f995a546..198ab1795be7bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -159,7 +159,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
 ; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
-; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 0
 ; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
 ; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
 ; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer

>From f090625ad3f97c252da3e84e7feaf0eccf4e0bf4 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 6 Nov 2024 13:39:59 +0000
Subject: [PATCH 09/12] Add read-after-write test

---
 .../LoopVectorize/AArch64/alias_mask.ll       | 132 ++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index 729c159063ba0b..6d2966ada04a99 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -115,3 +115,135 @@ for.body:                                         ; preds = %for.body.preheader,
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
+
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C4:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[C2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP6]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[C4]], [[B3]]
+; CHECK-NEXT:    [[SUB_ABS:%.*]] = call i64 @llvm.abs.i64(i64 [[SUB_DIFF]], i1 false)
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_ABS]], 0
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp eq i64 [[DIFF]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP18]], i32 2, <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i16> poison)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 8 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP22]], i32 2, <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i16> poison)
+; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 8 x i16> [[TMP23]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT:    [[TMP25]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i16> [[TMP24]], <vscale x 8 x i16> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <vscale x 8 x i1> [[TMP9]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP26]])
+; CHECK-NEXT:    [[TMP28:%.*]] = zext i8 [[TMP27]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP28]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT:    [[TMP29:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 8 x i1> [[TMP29]], i32 0
+; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[TMP25]])
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD9_LCSSA:%.*]] = phi i16 [ [[ADD9:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = zext i16 [[ADD9_LCSSA]] to i32
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[TOTAL_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[TOTAL_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TOTAL_020:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD9]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i16 [[TMP33]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[TMP33]], [[TOTAL_020]]
+; CHECK-NEXT:    [[ADD9]] = add i16 [[ADD]], [[TMP34]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  %cmp19 = icmp sgt i32 %n, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add9.lcssa = phi i16 [ %add9, %for.body ]
+  %0 = zext i16 %add9.lcssa to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %0, %for.cond.cleanup.loopexit ]
+  ret i32 %total.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %total.020 = phi i16 [ 0, %for.body.preheader ], [ %add9, %for.body ]
+  %arrayidx = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %1 = load i16, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds i16, ptr %c, i64 %indvars.iv
+  store i16 %1, ptr %arrayidx2, align 2
+  %arrayidx6 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %2 = load i16, ptr %arrayidx6, align 2
+  %add = add i16 %1, %total.020
+  %add9 = add i16 %add, %2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

>From abff583618d909ab8aff0ecb1e67a274ca023da5 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 6 Nov 2024 13:53:33 +0000
Subject: [PATCH 10/12] Simplify tests

---
 .../LoopVectorize/AArch64/alias_mask.ll       | 221 ++++++------------
 1 file changed, 77 insertions(+), 144 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index 6d2966ada04a99..2869d52bc68b96 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -1,18 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
 
-define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-LABEL: define dso_local void @alias_mask(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -24,7 +23,7 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP5]]
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
@@ -38,10 +37,10 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -66,184 +65,118 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP29]], [[TMP28]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+  %cmp11 = icmp sgt i64 %n, 0
+  br i1 %cmp11, label %for.body, label %exit
 
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext nneg i32 %n to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  br label %for.cond.cleanup
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %add = add i8 %load.b, %load.a
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  store i8 %add, ptr %gep.c, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+exit:                                 ; preds = %for.body, %entry
   ret void
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx2, align 1
-  %add = add i8 %1, %0
-  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
-  store i8 %add, ptr %arrayidx6, align 1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-LABEL: define i32 @alias_mask_read_after_write(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[C4:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[C2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP6]]
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[C4]], [[B3]]
 ; CHECK-NEXT:    [[SUB_ABS:%.*]] = call i64 @llvm.abs.i64(i64 [[SUB_DIFF]], i1 false)
 ; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_ABS]], 0
 ; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp eq i64 [[DIFF]], 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-; CHECK-NEXT:    [[TMP9:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[TMP9]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP17]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP18]], i32 2, <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i16> poison)
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 8 x i1> [[TMP16]])
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP22]], i32 2, <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i16> poison)
-; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 8 x i16> [[TMP23]], [[WIDE_MASKED_LOAD5]]
-; CHECK-NEXT:    [[TMP25]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i16> [[TMP24]], <vscale x 8 x i16> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <vscale x 8 x i1> [[TMP9]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP27:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP26]])
+; CHECK-NEXT:    [[TMP16:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <vscale x 4 x i1> [[TMP9]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP26]])
 ; CHECK-NEXT:    [[TMP28:%.*]] = zext i8 [[TMP27]] to i64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP28]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; CHECK-NEXT:    [[TMP29:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 8 x i1> [[TMP29]], i32 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT:    [[TMP29:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 4 x i1> [[TMP29]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[TMP25]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD9_LCSSA:%.*]] = phi i16 [ [[ADD9:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP32:%.*]] = zext i16 [[ADD9_LCSSA]] to i32
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[TOTAL_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[TOTAL_0_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TOTAL_020:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD9]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i16 [[TMP33]], ptr [[ARRAYIDX2]], align 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2
-; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[TMP33]], [[TOTAL_020]]
-; CHECK-NEXT:    [[ADD9]] = add i16 [[ADD]], [[TMP34]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
-  %cmp19 = icmp sgt i32 %n, 0
-  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+  %cmp19 = icmp sgt i64 %n, 0
+  br i1 %cmp19, label %for.body, label %exit
 
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext nneg i32 %n to i64
-  br label %for.body
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+  %load.a = load i32, ptr %gep.a, align 2
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %load.a, ptr %gep.c, align 2
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %load.b = load i32, ptr %gep.b, align 2
+  %add = add i32 %load.a, %accum
+  %add2 = add i32 %add, %load.b
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %add9.lcssa = phi i16 [ %add9, %for.body ]
-  %0 = zext i16 %add9.lcssa to i32
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %total.0.lcssa = phi i32 [ 0, %entry ], [ %0, %for.cond.cleanup.loopexit ]
-  ret i32 %total.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %total.020 = phi i16 [ 0, %for.body.preheader ], [ %add9, %for.body ]
-  %arrayidx = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
-  %1 = load i16, ptr %arrayidx, align 2
-  %arrayidx2 = getelementptr inbounds i16, ptr %c, i64 %indvars.iv
-  store i16 %1, ptr %arrayidx2, align 2
-  %arrayidx6 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
-  %2 = load i16, ptr %arrayidx6, align 2
-  %add = add i16 %1, %total.020
-  %add9 = add i16 %add, %2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+exit:                        ; preds = %entry, %for.body
+  %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  ret i32 %result
 }

>From 1035ead0918201c5d10be542f1f448bc62b0d6e9 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 13 Nov 2024 16:52:55 +0000
Subject: [PATCH 11/12] Emit intrinsic instead

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 ++----
 .../LoopVectorize/AArch64/alias_mask.ll       | 95 ++++++++-----------
 .../AArch64/induction-costs-sve.ll            |  8 +-
 3 files changed, 49 insertions(+), 83 deletions(-)
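
With this change the recipe no longer open-codes the abs/sdiv/splat/
active-lane-mask sequence and instead emits a single call to the alias lane
mask intrinsic, which the vector body then ANDs with the active lane mask. A
rough sketch for the i8 write-after-read case in alias_mask.ll, using
placeholder SSA names (%b and %c are the loaded-from and stored-to pointer
starts, %active.lane.mask and %gep.a come from the surrounding loop):

  vector.ph:
    ; element size 1 byte, write-after-read
    %alias.lane.mask = call <vscale x 16 x i1> @llvm.get.alias.lane.mask.nxv16i1.i64(i64 %b, i64 %c, i32 1, i1 true)
  vector.body:
    %mask = and <vscale x 16 x i1> %active.lane.mask, %alias.lane.mask
    %wide.masked.load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %gep.a, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> poison)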

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 664e5820c5dad4..4973e9c80bc9c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3268,28 +3268,13 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
   Value *SinkValue = State.get(getSinkValue(), true);
   Value *SourceValue = State.get(getSourceValue(), true);
 
-  Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
-  auto *Type = Diff->getType();
-  Value *Zero = ConstantInt::get(Type, 0);
-  if (!WriteAfterRead)
-    Diff = Builder.CreateIntrinsic(
-        Intrinsic::abs, {Type},
-        {Diff, ConstantInt::getFalse(Builder.getInt1Ty())}, nullptr, "sub.abs");
-
-  Value *DiffDiv = Builder.CreateSDiv(Diff, Zero, "diff");
-  // If the difference is positive then some elements may alias
-  auto CmpCode = WriteAfterRead ? CmpInst::Predicate::ICMP_SLE
-                                : CmpInst::Predicate::ICMP_EQ;
-  Value *Cmp = Builder.CreateICmp(CmpCode, DiffDiv, Zero, "neg.compare");
-
-  // Splat the compare result then OR it with a lane mask
-  Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
-  Value *DiffMask = Builder.CreateIntrinsic(
-      Intrinsic::get_active_lane_mask,
-      {VectorType::get(Builder.getInt1Ty(), State.VF), Type}, {Zero, DiffDiv},
-      nullptr, "ptr.diff.lane.mask");
-  Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
-  State.set(this, Or, /*IsScalar=*/false);
+  auto *Type = SinkValue->getType();
+  Value *AliasMask = Builder.CreateIntrinsic(
+      Intrinsic::get_alias_lane_mask,
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+      {SourceValue, SinkValue, Builder.getInt32(getAccessedElementSize()), Builder.getInt1(WriteAfterRead)}, nullptr,
+      "alias.lane.mask");
+  State.set(this, AliasMask, /*IsScalar=*/false);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index 2869d52bc68b96..311d65d5a83314 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -5,10 +5,10 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-LABEL: define dso_local void @alias_mask(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
@@ -16,29 +16,23 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C2]], [[B3]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP6]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
-; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B4]], [[C3]]
-; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 0
-; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[DIFF]])
-; CHECK-NEXT:    [[TMP8:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP10]]
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.alias.lane.mask.nxv16i1.i64(i64 [[B2]], i64 [[C1]], i32 1, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[TMP15]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -46,21 +40,21 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
 ; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP15]])
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <vscale x 16 x i1> [[TMP8]] to <vscale x 16 x i8>
-; CHECK-NEXT:    [[TMP24:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP23]])
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i8 [[TMP24]] to i64
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP28]])
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP23]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
 ; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
@@ -91,10 +85,10 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-LABEL: define i32 @alias_mask_read_after_write(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[C4:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
@@ -103,30 +97,23 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[C2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[B2]], [[C3]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP7]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[C4]], [[B3]]
-; CHECK-NEXT:    [[SUB_ABS:%.*]] = call i64 @llvm.abs.i64(i64 [[SUB_DIFF]], i1 false)
-; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_ABS]], 0
-; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp eq i64 [[DIFF]], 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
-; CHECK-NEXT:    [[TMP9:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.alias.lane.mask.nxv4i1.i64(i64 [[C2]], i64 [[B1]], i32 4, i1 false)
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -135,23 +122,23 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP9]]
+; CHECK-NEXT:    [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP31]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
-; CHECK-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <vscale x 4 x i1> [[TMP9]] to <vscale x 4 x i8>
-; CHECK-NEXT:    [[TMP27:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP26]])
-; CHECK-NEXT:    [[TMP28:%.*]] = zext i8 [[TMP27]] to i64
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP28]]
+; CHECK-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP32:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP32]])
+; CHECK-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; CHECK-NEXT:    [[TMP29:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 4 x i1> [[TMP29]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 198ab1795be7bf..1122eeb72472cc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -158,13 +158,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
-; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 0
-; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
-; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-; PRED-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = call <vscale x 8 x i1> @llvm.get.alias.lane.mask.nxv8i1.i64(i64 [[SRC2]], i64 [[DST1]], i32 1, i1 true)
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]

>From 3bceff7fa8b192215f00b5904927be2138bc9137 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 21 Nov 2024 16:57:58 +0000
Subject: [PATCH 12/12] Rebase on top of intrinsic patch

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp         | 10 ++++++----
 .../Transforms/LoopVectorize/AArch64/alias_mask.ll     |  4 ++--
 .../LoopVectorize/AArch64/induction-costs-sve.ll       |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4973e9c80bc9c6..c5200fc4a07290 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3270,10 +3270,12 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
 
   auto *Type = SinkValue->getType();
   Value *AliasMask = Builder.CreateIntrinsic(
-      Intrinsic::get_alias_lane_mask,
-      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
-      {SourceValue, SinkValue, Builder.getInt32(getAccessedElementSize()), Builder.getInt1(WriteAfterRead)}, nullptr,
-      "alias.lane.mask");
+      Intrinsic::experimental_get_alias_lane_mask,
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type,
+       Builder.getInt64Ty()},
+      {SourceValue, SinkValue, Builder.getInt64(getAccessedElementSize()),
+       Builder.getInt1(WriteAfterRead)},
+      nullptr, "alias.lane.mask");
   State.set(this, AliasMask, /*IsScalar=*/false);
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index 311d65d5a83314..ef9de59dba10cd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -28,7 +28,7 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
-; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.alias.lane.mask.nxv16i1.i64(i64 [[B2]], i64 [[C1]], i32 1, i1 true)
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64.i64(i64 [[B2]], i64 [[C1]], i64 1, i1 true)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP8]], 16
 ; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[TMP15]]
@@ -109,7 +109,7 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.alias.lane.mask.nxv4i1.i64(i64 [[C2]], i64 [[B1]], i32 4, i1 false)
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.get.alias.lane.mask.nxv4i1.i64.i64(i64 [[C2]], i64 [[B1]], i64 4, i1 false)
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP16]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 1122eeb72472cc..803c7a5d383704 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -158,7 +158,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = call <vscale x 8 x i1> @llvm.get.alias.lane.mask.nxv8i1.i64(i64 [[SRC2]], i64 [[DST1]], i32 1, i1 true)
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = call <vscale x 8 x i1> @llvm.experimental.get.alias.lane.mask.nxv8i1.i64.i64(i64 [[SRC2]], i64 [[DST1]], i64 1, i1 true)
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]

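For reference, the updated CHECK lines above pin down the IR shape that VPAliasLaneMaskRecipe::execute now emits after this rebase: the experimental intrinsic name, mangled on the result vector type, the pointer-address type and the element-size type, taking the two pointer addresses as i64, the element size in bytes as an i64 immediate, and an i1 hazard flag. A minimal illustrative sketch of that call shape follows; the function and value names (%ptrA, %ptrB, @alias_mask_sketch) are made up for the example, while the intrinsic name, mangling suffix and immediate operands match the updated tests (read-after-write case, 4-byte elements).

  ; Sketch only: build a lane mask for two i64 pointer addresses,
  ; 4-byte elements, writeAfterRead = false (read-after-write hazard).
  define <vscale x 4 x i1> @alias_mask_sketch(i64 %ptrA, i64 %ptrB) {
    %alias.lane.mask = call <vscale x 4 x i1> @llvm.experimental.get.alias.lane.mask.nxv4i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 4, i1 false)
    ret <vscale x 4 x i1> %alias.lane.mask
  }

  declare <vscale x 4 x i1> @llvm.experimental.get.alias.lane.mask.nxv4i1.i64.i64(i64, i64, i64 immarg, i1 immarg)

In the vector body this mask is then ANDed with the active lane mask before the masked loads/stores, as the alias_mask.ll CHECK lines show.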

