[llvm] 19e1011 - [SelectionDAG] Fix unsafe cases for loop.dependence.{war/raw}.mask (#168565)

via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 12 00:44:38 PST 2025


Author: Sam Tebbs
Date: 2025-12-12T08:44:33Z
New Revision: 19e1011df51e3f88a02f5522073aa788e479a68b

URL: https://github.com/llvm/llvm-project/commit/19e1011df51e3f88a02f5522073aa788e479a68b
DIFF: https://github.com/llvm/llvm-project/commit/19e1011df51e3f88a02f5522073aa788e479a68b.diff

LOG: [SelectionDAG] Fix unsafe cases for loop.dependence.{war/raw}.mask (#168565)

Both `LOOP_DEPENDENCE_WAR_MASK` and `LOOP_DEPENDENCE_RAW_MASK` are
currently hard to split correctly, and the existing splitting is
incorrect in a number of cases.

The difficulty comes from how the intrinsics are defined. For example,
take `LOOP_DEPENDENCE_WAR_MASK`.

It is defined as the OR of the following two conditions (a worked
sketch follows the list):

* `(ptrB - ptrA) <= 0`
* `elementSize * lane < (ptrB - ptrA)`
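
As a rough illustration, here is a standalone C++ sketch of that
definition. The pointer values and lane count are made-up example
inputs, not anything taken from LLVM:

```cpp
// Standalone sketch of the WAR-mask definition above, with assumed
// example values PtrA = 100, PtrB = 103, EltSize = 1.
#include <cstdint>
#include <cstdio>

int main() {
  int64_t PtrA = 100, PtrB = 103, EltSize = 1;
  int64_t Diff = PtrB - PtrA; // 3 bytes between the pointers
  for (int64_t Lane = 0; Lane < 8; ++Lane) {
    // A lane is active if the whole access pair is hazard-free
    // (Diff <= 0) or this lane precedes the overlap between them.
    bool Active = Diff <= 0 || EltSize * Lane < Diff;
    printf("lane %lld: %d\n", (long long)Lane, (int)Active);
  }
  // Prints 1 for lanes 0..2 and 0 for lanes 3..7.
}
```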

Now, if we split a loop dependence mask, then for the high half of the
mask we want to compute:

* `(ptrB - ptrA) <= 0`
* `elementSize * (lane + LoVT.getElementCount()) < (ptrB - ptrA)`

However, with the current opcode definitions, we can only modify ptrA
or ptrB, and doing so may change the result of the first condition,
which should be invariant to the lane.
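
To make this concrete, here is a small self-contained check (with
assumed pointer values; this is illustrative C++, not the legalizer
code) showing the old ptrA-offsetting split marking a lane active that
the full mask marks inactive:

```cpp
// Sketch of the unsafe case: offsetting ptrA flips the sign of Diff,
// so the lane-invariant "Diff <= 0" test wrongly fires for the high half.
#include <cassert>
#include <cstdint>

static bool WarLane(int64_t PtrA, int64_t PtrB, int64_t EltSize,
                    int64_t Lane) {
  int64_t Diff = PtrB - PtrA;
  return Diff <= 0 || EltSize * Lane < Diff;
}

int main() {
  int64_t PtrA = 100, PtrB = 104, EltSize = 1;
  // Lane 8 of the full 16-lane mask: 8 >= Diff (4), so it is inactive.
  assert(!WarLane(PtrA, PtrB, EltSize, 8));
  // Old high-half split: lane 8 became lane 0 of a mask over PtrA + 8.
  // Now Diff = -4 <= 0, so the lane is (incorrectly) reported active.
  assert(WarLane(PtrA + 8, PtrB, EltSize, 0));
}
```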

This patch resolves these cases by adding a "lane offset" to the ISD
opcodes. The lane offset is always a constant. For scalable masks, it is
implicitly multiplied by vscale.

This makes splitting trivial, as we now just increment the lane offset
of the high half by `LoVT.getElementCount()`.
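
For example, an 8-lane mask with lane offset 0 now splits into two
4-lane masks with offsets 0 and 4. A minimal sketch (the `WarLane`
helper is hypothetical, modelling the ISD-level semantics) of why the
concatenated halves match the unsplit mask:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// One lane of the ISD-level WAR-mask semantics, with a lane offset.
// Diff is (ptrB - ptrA) in bytes, per the definition above.
static bool WarLane(int64_t Diff, int64_t EltSize, int64_t Lane,
                    int64_t LaneOffset) {
  return Diff <= 0 || EltSize * (Lane + LaneOffset) < Diff;
}

int main() {
  for (int64_t Diff = -4; Diff <= 12; ++Diff) {
    std::vector<bool> Split;
    for (int64_t Lane = 0; Lane < 4; ++Lane) // Lo half: lane offset 0
      Split.push_back(WarLane(Diff, /*EltSize=*/1, Lane, /*LaneOffset=*/0));
    for (int64_t Lane = 0; Lane < 4; ++Lane) // Hi half: lane offset 4
      Split.push_back(WarLane(Diff, /*EltSize=*/1, Lane, /*LaneOffset=*/4));
    // The concatenated halves equal the unsplit 8-lane mask.
    for (int64_t Lane = 0; Lane < 8; ++Lane)
      assert(Split[Lane] == WarLane(Diff, 1, Lane, 0));
  }
}
```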

Note: In the AArch64 backend, we only support zero lane offsets (as
other cases are tricky to lower to whilewr/rw).

---------

Co-authored-by: Benjamin Maxwell <benjamin.maxwell at arm.com>

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/ISDOpcodes.h
    llvm/include/llvm/Target/TargetSelectionDAG.td
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/SVEInstrFormats.td
    llvm/test/CodeGen/AArch64/alias_mask.ll
    llvm/test/CodeGen/AArch64/alias_mask_nosve.ll
    llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
    llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll

Removed: 
    llvm/test/CodeGen/AArch64/loop-dependence-mask-ccmp.ll


################################################################################
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index b32f3dacbb3a4..2ef7a38019c12 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1569,8 +1569,21 @@ enum NodeType {
   GET_ACTIVE_LANE_MASK,
 
   // The `llvm.loop.dependence.{war, raw}.mask` intrinsics
-  // Operands: Load pointer, Store pointer, Element size
+  // Operands: Load pointer, Store pointer, Element size, Lane offset
   // Output: Mask
+  //
+  // Note: The semantics of these opcodes differ slightly from the intrinsics.
+  // Wherever "lane" (meaning lane index) occurs in the intrinsic definition, it
+  // is replaced with (lane + lane_offset) for the ISD opcode.
+  //
+  //  E.g., for LOOP_DEPENDENCE_WAR_MASK:
+  //    `elementSize * lane < (ptrB - ptrA)`
+  //  Becomes:
+  //    `elementSize * (lane + lane_offset) < (ptrB - ptrA)`
+  //
+  // This is done to allow for trivial splitting of the operation. Note: The
+  // lane offset is always a constant; for scalable masks, it is implicitly
+  // multiplied by vscale.
   LOOP_DEPENDENCE_WAR_MASK,
   LOOP_DEPENDENCE_RAW_MASK,
 

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index a9750a5ab03f9..e6f14b9f1a402 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -347,6 +347,11 @@ def SDTAtomicLoad : SDTypeProfile<1, 1, [
   SDTCisPtrTy<1>
 ]>;
 
+def SDTLoopDepMask : SDTypeProfile<1, 4,
+  [/*Result=*/SDTCisVec<0>, /*PtrA=*/SDTCisInt<1>, /*PtrB=*/SDTCisInt<2>,
+   /*EltSizeInBytes=*/SDTCisInt<3>, /*LaneOffset=*/SDTCisInt<4>,
+   SDTCisSameAs<2, 1>]>;
+
 class SDCallSeqStart<list<SDTypeConstraint> constraints> :
         SDTypeProfile<0, 2, constraints>;
 class SDCallSeqEnd<list<SDTypeConstraint> constraints> :
@@ -839,10 +844,6 @@ def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1,
                        [SDTCisVec<0>, SDTCisInt<1>]>, []>;
 def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
                               []>;
-
-def SDTLoopDepMask : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
-                                   SDTCisSameAs<2, 1>, SDTCisInt<3>,
-                                   SDTCVecEltisVT<0,i1>]>;
 def loop_dependence_war_mask : SDNode<"ISD::LOOP_DEPENDENCE_WAR_MASK",
                                       SDTLoopDepMask, []>;
 def loop_dependence_raw_mask : SDNode<"ISD::LOOP_DEPENDENCE_RAW_MASK",

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index b34928c8e6950..22c5f7dffa80d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1807,46 +1807,41 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
 
 SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) {
   SDLoc DL(N);
+  EVT VT = N->getValueType(0);
   SDValue SourceValue = N->getOperand(0);
   SDValue SinkValue = N->getOperand(1);
-  SDValue EltSize = N->getOperand(2);
+  SDValue EltSizeInBytes = N->getOperand(2);
+
+  // Note: The lane offset is scalable if the mask is scalable.
+  ElementCount LaneOffsetEC =
+      ElementCount::get(N->getConstantOperandVal(3), VT.isScalableVT());
 
-  bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK;
-  EVT VT = N->getValueType(0);
   EVT PtrVT = SourceValue->getValueType(0);
+  bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK;
 
+  // Take the difference between the pointers and divide by the element size,
+  // to see how many lanes separate them.
   SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue);
   if (IsReadAfterWrite)
     Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff);
+  Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSizeInBytes);
 
-  Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize);
-
-  // If the difference is positive then some elements may alias
-  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
-                                     Diff.getValueType());
+  // The pointers do not alias if:
+  //  * Diff <= 0 (WAR_MASK)
+  //  * Diff == 0 (RAW_MASK)
+  EVT CmpVT =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), PtrVT);
   SDValue Zero = DAG.getConstant(0, DL, PtrVT);
   SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero,
                              IsReadAfterWrite ? ISD::SETEQ : ISD::SETLE);
 
-  // Create the lane mask
-  EVT SplatVT = VT.changeElementType(PtrVT);
-  SDValue DiffSplat = DAG.getSplat(SplatVT, DL, Diff);
-  SDValue VectorStep = DAG.getStepVector(DL, SplatVT);
-  EVT MaskVT = VT.changeElementType(MVT::i1);
-  SDValue DiffMask =
-      DAG.getSetCC(DL, MaskVT, VectorStep, DiffSplat, ISD::CondCode::SETULT);
+  // The pointers do not alias if:
+  // Lane + LaneOffset < Diff (WAR/RAW_MASK)
+  SDValue LaneOffset = DAG.getElementCount(DL, PtrVT, LaneOffsetEC);
+  SDValue MaskN =
+      DAG.getSelect(DL, PtrVT, Cmp, DAG.getConstant(-1, DL, PtrVT), Diff);
 
-  EVT EltVT = VT.getVectorElementType();
-  // Extend the diff setcc in case the intrinsic has been promoted to a vector
-  // type with elements larger than i1
-  if (EltVT.getScalarSizeInBits() > MaskVT.getScalarSizeInBits())
-    DiffMask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, DiffMask);
-
-  // Splat the compare result then OR it with the lane mask
-  if (CmpVT.getScalarSizeInBits() < EltVT.getScalarSizeInBits())
-    Cmp = DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Cmp);
-  SDValue Splat = DAG.getSplat(VT, DL, Cmp);
-  return DAG.getNode(ISD::OR, DL, VT, DiffMask, Splat);
+  return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, VT, LaneOffset, MaskN);
 }
 
 void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index da3102d30e153..362c1936de208 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -404,19 +404,33 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
+  SDLoc DL(N);
   SDValue SourceValue = N->getOperand(0);
   SDValue SinkValue = N->getOperand(1);
-  SDValue EltSize = N->getOperand(2);
+  SDValue EltSizeInBytes = N->getOperand(2);
+  SDValue LaneOffset = N->getOperand(3);
+
   EVT PtrVT = SourceValue->getValueType(0);
-  SDLoc DL(N);
+  bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK;
 
+  // Take the difference between the pointers and divide by the element size,
+  // to see how many lanes separate them.
   SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue);
+  if (IsReadAfterWrite)
+    Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff);
+  Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSizeInBytes);
+
+  // The pointers do not alias if:
+  //  * Diff <= 0 || LaneOffset < Diff (WAR_MASK)
+  //  * Diff == 0 || LaneOffset < abs(Diff) (RAW_MASK)
+  // Note: If LaneOffset is zero, both cases will fold to "true".
   EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                      Diff.getValueType());
   SDValue Zero = DAG.getConstant(0, DL, PtrVT);
-  return DAG.getNode(ISD::OR, DL, CmpVT,
-                     DAG.getSetCC(DL, CmpVT, Diff, EltSize, ISD::SETGE),
-                     DAG.getSetCC(DL, CmpVT, Diff, Zero, ISD::SETEQ));
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero,
+                             IsReadAfterWrite ? ISD::SETEQ : ISD::SETLE);
+  return DAG.getNode(ISD::OR, DL, CmpVT, Cmp,
+                     DAG.getSetCC(DL, CmpVT, LaneOffset, Diff, ISD::SETULT));
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
@@ -1695,17 +1709,22 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
                                                         SDValue &Hi) {
   SDLoc DL(N);
   EVT LoVT, HiVT;
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   SDValue PtrA = N->getOperand(0);
   SDValue PtrB = N->getOperand(1);
-  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2));
-
-  unsigned EltSize = N->getConstantOperandVal(2);
-  ElementCount Offset = HiVT.getVectorElementCount() * EltSize;
-  SDValue Addend = DAG.getElementCount(DL, MVT::i64, Offset);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
-  PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend);
-  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2));
+  // The lane offset for the "Lo" half of the mask is unchanged.
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB,
+                   /*ElementSizeInBytes=*/N->getOperand(2),
+                   /*LaneOffset=*/N->getOperand(3));
+  // The lane offset for the "Hi" half of the mask is incremented by the number
+  // of elements in the "Lo" half.
+  unsigned LaneOffset =
+      N->getConstantOperandVal(3) + LoVT.getVectorMinNumElements();
+  // Note: The lane offset is implicitly scalable for scalable masks.
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB,
+                   /*ElementSizeInBytes=*/N->getOperand(2),
+                   /*LaneOffset=*/DAG.getConstant(LaneOffset, DL, MVT::i64));
 }
 
 void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
@@ -6050,7 +6069,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
   return DAG.getNode(
       N->getOpcode(), SDLoc(N),
       TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)),
-      N->getOperand(0), N->getOperand(1), N->getOperand(2));
+      N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3));
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 05aec6353f924..c34f095e19cbf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8427,13 +8427,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I,
              DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl,
                          EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
-                         getValue(I.getOperand(1)), getValue(I.getOperand(2))));
+                         getValue(I.getOperand(1)), getValue(I.getOperand(2)),
+                         DAG.getConstant(0, sdl, MVT::i64)));
     return;
   case Intrinsic::loop_dependence_raw_mask:
     setValue(&I,
              DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl,
                          EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
-                         getValue(I.getOperand(1)), getValue(I.getOperand(2))));
+                         getValue(I.getOperand(1)), getValue(I.getOperand(2)),
+                         DAG.getConstant(0, sdl, MVT::i64)));
     return;
   }
 }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 41caa817c11a4..1ade1df88f010 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5440,9 +5440,9 @@ SDValue
 AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc DL(Op);
-  uint64_t EltSize = Op.getConstantOperandVal(2);
   EVT VT = Op.getValueType();
-  switch (EltSize) {
+  SDValue EltSize = Op.getOperand(2);
+  switch (EltSize->getAsZExtVal()) {
   case 1:
     if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
       return SDValue();
@@ -5464,11 +5464,15 @@ AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
     return SDValue();
   }
 
+  SDValue LaneOffset = Op.getOperand(3);
+  if (LaneOffset->getAsZExtVal())
+    return SDValue();
+
   SDValue PtrA = Op.getOperand(0);
   SDValue PtrB = Op.getOperand(1);
 
   if (VT.isScalableVT())
-    return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
+    return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, EltSize, LaneOffset);
 
   // We can use the SVE whilewr/whilerw instruction to lower this
   // intrinsic by creating the appropriate sequence of scalable vector
@@ -5480,7 +5484,7 @@ AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
   EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
 
   SDValue Mask =
-      DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
+      DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, EltSize, LaneOffset);
   SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
                      DAG.getVectorIdxConstant(0, DL));
@@ -6251,35 +6255,43 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_sve_whilewr_b:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(1, DL, MVT::i64));
+                       DAG.getConstant(1, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilewr_h:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(2, DL, MVT::i64));
+                       DAG.getConstant(2, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilewr_s:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(4, DL, MVT::i64));
+                       DAG.getConstant(4, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilewr_d:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(8, DL, MVT::i64));
+                       DAG.getConstant(8, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilerw_b:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(1, DL, MVT::i64));
+                       DAG.getConstant(1, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilerw_h:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(2, DL, MVT::i64));
+                       DAG.getConstant(2, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilerw_s:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(4, DL, MVT::i64));
+                       DAG.getConstant(4, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_sve_whilerw_d:
     return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(8, DL, MVT::i64));
+                       DAG.getConstant(8, DL, MVT::i64),
+                       DAG.getConstant(0, DL, MVT::i64));
   case Intrinsic::aarch64_neon_abs: {
     EVT Ty = Op.getValueType();
     if (Ty == MVT::i64) {

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 7a0d3711a2bce..970558c8db52e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -6020,13 +6020,13 @@ multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> {
   def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
   def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
 
-  def : Pat<(nxv16i1 (op i64:$Op1, i64:$Op2, (i64 1))),
+  def : Pat<(nxv16i1 (op i64:$Op1, i64:$Op2, (i64 1), (i64 0))),
             (!cast<Instruction>(NAME # _B) $Op1, $Op2)>;
-  def : Pat<(nxv8i1 (op i64:$Op1, i64:$Op2, (i64 2))),
+  def : Pat<(nxv8i1 (op i64:$Op1, i64:$Op2, (i64 2), (i64 0))),
             (!cast<Instruction>(NAME # _H) $Op1, $Op2)>;
-  def : Pat<(nxv4i1 (op i64:$Op1, i64:$Op2, (i64 4))),
+  def : Pat<(nxv4i1 (op i64:$Op1, i64:$Op2, (i64 4), (i64 0))),
             (!cast<Instruction>(NAME # _S) $Op1, $Op2)>;
-  def : Pat<(nxv2i1 (op i64:$Op1, i64:$Op2, (i64 8))),
+  def : Pat<(nxv2i1 (op i64:$Op1, i64:$Op2, (i64 8), (i64 0))),
             (!cast<Instruction>(NAME # _D) $Op1, $Op2)>;
 }
 

diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index fdd0a6a4709da..bf393b6e87710 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -100,9 +100,12 @@ entry:
 define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_split:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add x9, x0, #16
+; CHECK-NEXT:    sub x9, x1, x0
+; CHECK-NEXT:    mov w10, #16 // =0x10
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
 ; CHECK-NEXT:    whilewr p0.b, x0, x1
-; CHECK-NEXT:    whilewr p1.b, x9, x1
+; CHECK-NEXT:    whilelo p1.b, x10, x9
 ; CHECK-NEXT:    adrp x9, .LCPI8_0
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI8_0]
@@ -130,18 +133,21 @@ entry:
 define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_split2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add x9, x0, #48
+; CHECK-NEXT:    sub x9, x1, x0
+; CHECK-NEXT:    mov w10, #48 // =0x30
+; CHECK-NEXT:    mov w11, #32 // =0x20
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
 ; CHECK-NEXT:    whilewr p0.b, x0, x1
-; CHECK-NEXT:    add x10, x0, #16
-; CHECK-NEXT:    whilewr p1.b, x9, x1
-; CHECK-NEXT:    add x9, x0, #32
+; CHECK-NEXT:    whilelo p1.b, x10, x9
+; CHECK-NEXT:    mov w10, #16 // =0x10
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    whilewr p0.b, x9, x1
-; CHECK-NEXT:    adrp x9, .LCPI9_0
+; CHECK-NEXT:    whilelo p0.b, x11, x9
 ; CHECK-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    whilewr p1.b, x10, x1
-; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI9_0]
+; CHECK-NEXT:    whilelo p1.b, x10, x9
+; CHECK-NEXT:    adrp x9, .LCPI9_0
 ; CHECK-NEXT:    mov z2.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI9_0]
 ; CHECK-NEXT:    mov z3.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #7
@@ -180,44 +186,14 @@ entry:
 define <16 x i1> @whilewr_16_expand(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_16_expand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    sub x8, x1, x0
 ; CHECK-NEXT:    add x8, x8, x8, lsr #63
 ; CHECK-NEXT:    asr x8, x8, #1
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    mov z16.d, z0.d
-; CHECK-NEXT:    dup v3.2d, x8
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
-; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
-; CHECK-NEXT:    add z6.d, z6.d, #4 // =0x4
-; CHECK-NEXT:    add z7.d, z7.d, #2 // =0x2
-; CHECK-NEXT:    add z16.d, z16.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmhi v1.2d, v3.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v3.2d, v2.2d
-; CHECK-NEXT:    cmhi v4.2d, v3.2d, v4.2d
-; CHECK-NEXT:    cmhi v5.2d, v3.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v3.2d, v6.2d
-; CHECK-NEXT:    cmhi v16.2d, v3.2d, v16.2d
-; CHECK-NEXT:    cmhi v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    uzp1 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    uzp1 v4.4s, v6.4s, v5.4s
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 2)
@@ -228,81 +204,31 @@ define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_16_expand2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub x9, x1, x0
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x10, x9, #32
+; CHECK-NEXT:    mov w10, #16 // =0x10
 ; CHECK-NEXT:    add x9, x9, x9, lsr #63
-; CHECK-NEXT:    add x10, x10, x10, lsr #63
 ; CHECK-NEXT:    asr x9, x9, #1
-; CHECK-NEXT:    asr x10, x10, #1
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    dup v7.2d, x9
-; CHECK-NEXT:    dup v16.2d, x10
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v7.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v16.2d, v6.2d
-; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT:    uzp1 v16.4s, v21.4s, v20.4s
 ; CHECK-NEXT:    cmp x9, #1
-; CHECK-NEXT:    uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT:    uzp1 v17.4s, v17.4s, v24.4s
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT:    uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT:    uzp1 v3.8h, v16.8h, v3.8h
-; CHECK-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
-; CHECK-NEXT:    dup v2.16b, w9
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, x10, x9
+; CHECK-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-NEXT:    adrp x9, .LCPI11_0
-; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    dup v3.16b, w10
-; CHECK-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    orr v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI11_0]
-; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    addv h1, v1.8h
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    zip1 v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    str h1, [x8]
+; CHECK-NEXT:    addv h1, v1.8h
 ; CHECK-NEXT:    str h0, [x8, #2]
+; CHECK-NEXT:    str h1, [x8]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 2)
@@ -312,30 +238,15 @@ entry:
 define <8 x i1> @whilewr_32_expand(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
 ; CHECK-NEXT:    add x9, x8, #3
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #2
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z2.d, z2.d, #4 // =0x4
-; CHECK-NEXT:    add z3.d, z3.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    cmhi v4.2d, v1.2d, v4.2d
-; CHECK-NEXT:    cmhi v2.2d, v1.2d, v2.2d
-; CHECK-NEXT:    cmhi v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    uzp1 v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v1.8b, w8
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
@@ -345,45 +256,15 @@ entry:
 define <16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
 ; CHECK-NEXT:    add x9, x8, #3
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #2
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    mov z16.d, z0.d
-; CHECK-NEXT:    dup v3.2d, x8
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
-; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
-; CHECK-NEXT:    add z6.d, z6.d, #4 // =0x4
-; CHECK-NEXT:    add z7.d, z7.d, #2 // =0x2
-; CHECK-NEXT:    add z16.d, z16.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmhi v1.2d, v3.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v3.2d, v2.2d
-; CHECK-NEXT:    cmhi v4.2d, v3.2d, v4.2d
-; CHECK-NEXT:    cmhi v5.2d, v3.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v3.2d, v6.2d
-; CHECK-NEXT:    cmhi v16.2d, v3.2d, v16.2d
-; CHECK-NEXT:    cmhi v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    uzp1 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    uzp1 v4.4s, v6.4s, v5.4s
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 4)
@@ -393,85 +274,33 @@ entry:
 define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x10, x1, x0
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x9, x10, #61
-; CHECK-NEXT:    subs x11, x10, #64
-; CHECK-NEXT:    add x12, x10, #3
-; CHECK-NEXT:    csel x9, x9, x11, mi
-; CHECK-NEXT:    asr x11, x9, #2
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    cmp x11, #1
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    csel x10, x12, x10, mi
-; CHECK-NEXT:    dup v7.2d, x11
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    asr x10, x10, #2
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    dup v16.2d, x10
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v16.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v16.2d, v6.2d
-; CHECK-NEXT:    cmhi v5.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v16.2d, v16.2d, v0.2d
-; CHECK-NEXT:    cmhi v6.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v0.2d, v7.2d, v0.2d
-; CHECK-NEXT:    uzp1 v7.4s, v21.4s, v20.4s
-; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp1 v3.4s, v23.4s, v4.4s
-; CHECK-NEXT:    uzp1 v4.4s, v18.4s, v24.4s
-; CHECK-NEXT:    uzp1 v5.4s, v5.4s, v22.4s
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v6.4s, v17.4s, v6.4s
-; CHECK-NEXT:    uzp1 v0.4s, v19.4s, v0.4s
-; CHECK-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT:    uzp1 v0.8h, v7.8h, v0.8h
-; CHECK-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    uzp1 v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    dup v3.16b, w10
-; CHECK-NEXT:    dup v2.16b, w9
+; CHECK-NEXT:    subs x9, x1, x0
+; CHECK-NEXT:    add x10, x9, #3
+; CHECK-NEXT:    csel x9, x10, x9, mi
+; CHECK-NEXT:    mov w10, #16 // =0x10
+; CHECK-NEXT:    asr x9, x9, #2
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, x10, x9
+; CHECK-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-NEXT:    adrp x9, .LCPI14_0
-; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_0]
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    addv h1, v1.8h
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    zip1 v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    str h1, [x8]
+; CHECK-NEXT:    addv h1, v1.8h
 ; CHECK-NEXT:    str h0, [x8, #2]
+; CHECK-NEXT:    str h1, [x8]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -481,22 +310,15 @@ entry:
 define <4 x i1> @whilewr_64_expand(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
 ; CHECK-NEXT:    add x9, x8, #7
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    dup v2.2d, x8
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    add z1.d, z1.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v0.2d, v2.2d, v0.2d
-; CHECK-NEXT:    cmhi v1.2d, v2.2d, v1.2d
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 8)
@@ -506,30 +328,15 @@ entry:
 define <8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
 ; CHECK-NEXT:    add x9, x8, #7
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z2.d, z2.d, #4 // =0x4
-; CHECK-NEXT:    add z3.d, z3.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    cmhi v4.2d, v1.2d, v4.2d
-; CHECK-NEXT:    cmhi v2.2d, v1.2d, v2.2d
-; CHECK-NEXT:    cmhi v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    uzp1 v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v1.8b, w8
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 8)
@@ -539,45 +346,15 @@ entry:
 define <16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
 ; CHECK-NEXT:    add x9, x8, #7
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    mov z16.d, z0.d
-; CHECK-NEXT:    dup v3.2d, x8
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
-; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
-; CHECK-NEXT:    add z6.d, z6.d, #4 // =0x4
-; CHECK-NEXT:    add z7.d, z7.d, #2 // =0x2
-; CHECK-NEXT:    add z16.d, z16.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmhi v1.2d, v3.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v3.2d, v2.2d
-; CHECK-NEXT:    cmhi v4.2d, v3.2d, v4.2d
-; CHECK-NEXT:    cmhi v5.2d, v3.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v3.2d, v6.2d
-; CHECK-NEXT:    cmhi v16.2d, v3.2d, v16.2d
-; CHECK-NEXT:    cmhi v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    uzp1 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    uzp1 v4.4s, v6.4s, v5.4s
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
@@ -587,85 +364,33 @@ entry:
 define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x10, x1, x0
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x9, x10, #121
-; CHECK-NEXT:    subs x11, x10, #128
-; CHECK-NEXT:    add x12, x10, #7
-; CHECK-NEXT:    csel x9, x9, x11, mi
-; CHECK-NEXT:    asr x11, x9, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    cmp x11, #1
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    csel x10, x12, x10, mi
-; CHECK-NEXT:    dup v7.2d, x11
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    asr x10, x10, #3
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    dup v16.2d, x10
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v16.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v16.2d, v6.2d
-; CHECK-NEXT:    cmhi v5.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v16.2d, v16.2d, v0.2d
-; CHECK-NEXT:    cmhi v6.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v0.2d, v7.2d, v0.2d
-; CHECK-NEXT:    uzp1 v7.4s, v21.4s, v20.4s
-; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp1 v3.4s, v23.4s, v4.4s
-; CHECK-NEXT:    uzp1 v4.4s, v18.4s, v24.4s
-; CHECK-NEXT:    uzp1 v5.4s, v5.4s, v22.4s
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v6.4s, v17.4s, v6.4s
-; CHECK-NEXT:    uzp1 v0.4s, v19.4s, v0.4s
-; CHECK-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT:    uzp1 v0.8h, v7.8h, v0.8h
-; CHECK-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    uzp1 v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    dup v3.16b, w10
-; CHECK-NEXT:    dup v2.16b, w9
+; CHECK-NEXT:    subs x9, x1, x0
+; CHECK-NEXT:    add x10, x9, #7
+; CHECK-NEXT:    csel x9, x10, x9, mi
+; CHECK-NEXT:    mov w10, #16 // =0x10
+; CHECK-NEXT:    asr x9, x9, #3
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, x10, x9
+; CHECK-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-NEXT:    adrp x9, .LCPI18_0
-; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI18_0]
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    addv h1, v1.8h
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    zip1 v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    str h1, [x8]
+; CHECK-NEXT:    addv h1, v1.8h
 ; CHECK-NEXT:    str h0, [x8, #2]
+; CHECK-NEXT:    str h1, [x8]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 8)
@@ -743,44 +468,14 @@ define <16 x i1> @whilewr_badimm(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
 ; CHECK-NEXT:    sub x9, x1, x0
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    movk x8, #21846
 ; CHECK-NEXT:    smulh x8, x9, x8
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    mov z16.d, z0.d
 ; CHECK-NEXT:    add x8, x8, x8, lsr #63
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
-; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
-; CHECK-NEXT:    add z6.d, z6.d, #4 // =0x4
-; CHECK-NEXT:    dup v3.2d, x8
-; CHECK-NEXT:    add z16.d, z16.d, #14 // =0xe
-; CHECK-NEXT:    add z7.d, z7.d, #2 // =0x2
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmhi v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    cmhi v1.2d, v3.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v3.2d, v2.2d
-; CHECK-NEXT:    cmhi v4.2d, v3.2d, v4.2d
-; CHECK-NEXT:    cmhi v16.2d, v3.2d, v16.2d
-; CHECK-NEXT:    cmhi v5.2d, v3.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v3.2d, v6.2d
-; CHECK-NEXT:    cmhi v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    uzp1 v4.4s, v6.4s, v5.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 3)
@@ -792,9 +487,7 @@ entry:
 define <1 x i1> @whilewr_8_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmn x8, #1
-; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 1)
@@ -804,10 +497,7 @@ entry:
 define <1 x i1> @whilewr_16_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_16_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    ccmp x8, #0, #4, le
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 2)
@@ -817,10 +507,7 @@ entry:
 define <1 x i1> @whilewr_32_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmp x8, #3
-; CHECK-NEXT:    ccmp x8, #0, #4, le
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 4)
@@ -830,10 +517,7 @@ entry:
 define <1 x i1> @whilewr_64_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmp x8, #7
-; CHECK-NEXT:    ccmp x8, #0, #4, le
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 8)
@@ -843,9 +527,7 @@ entry:
 define <1 x i1> @whilerw_8_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilerw_8_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmn x8, #1
-; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 1)
@@ -855,10 +537,7 @@ entry:
 define <1 x i1> @whilerw_16_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilerw_16_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    ccmp x8, #0, #4, le
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 2)
@@ -868,10 +547,7 @@ entry:
 define <1 x i1> @whilerw_32_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilerw_32_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmp x8, #3
-; CHECK-NEXT:    ccmp x8, #0, #4, le
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 4)
@@ -881,10 +557,7 @@ entry:
 define <1 x i1> @whilerw_64_scalarize(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilerw_64_scalarize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    cmp x8, #7
-; CHECK-NEXT:    ccmp x8, #0, #4, le
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 8)

diff --git a/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll
index 922b37c2f2a08..0b1221244a757 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll
@@ -1,47 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
 
+; TODO: Currently lowering get_active_lane_mask requires +sve
+; XFAIL: *
+
 define <16 x i1> @whilewr_8(ptr %a, ptr %b) {
-; CHECK-LABEL: whilewr_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    adrp x10, .LCPI0_1
-; CHECK-NEXT:    sub x9, x1, x0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT:    adrp x8, .LCPI0_2
-; CHECK-NEXT:    ldr q1, [x10, :lo12:.LCPI0_1]
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI0_2]
-; CHECK-NEXT:    adrp x8, .LCPI0_4
-; CHECK-NEXT:    adrp x10, .LCPI0_3
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI0_4]
-; CHECK-NEXT:    adrp x8, .LCPI0_5
-; CHECK-NEXT:    dup v2.2d, x9
-; CHECK-NEXT:    ldr q4, [x10, :lo12:.LCPI0_3]
-; CHECK-NEXT:    adrp x10, .LCPI0_6
-; CHECK-NEXT:    ldr q6, [x8, :lo12:.LCPI0_5]
-; CHECK-NEXT:    adrp x8, .LCPI0_7
-; CHECK-NEXT:    ldr q7, [x10, :lo12:.LCPI0_6]
-; CHECK-NEXT:    cmp x9, #1
-; CHECK-NEXT:    ldr q16, [x8, :lo12:.LCPI0_7]
-; CHECK-NEXT:    cmhi v0.2d, v2.2d, v0.2d
-; CHECK-NEXT:    cmhi v1.2d, v2.2d, v1.2d
-; CHECK-NEXT:    cmhi v3.2d, v2.2d, v3.2d
-; CHECK-NEXT:    cmhi v4.2d, v2.2d, v4.2d
-; CHECK-NEXT:    cmhi v5.2d, v2.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v2.2d, v6.2d
-; CHECK-NEXT:    cmhi v7.2d, v2.2d, v7.2d
-; CHECK-NEXT:    cmhi v2.2d, v2.2d, v16.2d
-; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    uzp1 v1.4s, v4.4s, v3.4s
-; CHECK-NEXT:    uzp1 v3.4s, v6.4s, v5.4s
-; CHECK-NEXT:    uzp1 v2.4s, v2.4s, v7.4s
-; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
 entry:
   %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
   ret <16 x i1> %0

diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
index 3435ceca28e17..8a2eff3fde396 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll
@@ -84,9 +84,12 @@ entry:
 define <vscale x 32 x i1> @whilewr_8_split(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_split:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub x9, x1, x0
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
 ; CHECK-NEXT:    whilewr p0.b, x0, x1
-; CHECK-NEXT:    incb x0
-; CHECK-NEXT:    whilewr p1.b, x0, x1
+; CHECK-NEXT:    whilelo p1.b, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 1)
@@ -96,14 +99,16 @@ entry:
 define <vscale x 64 x i1> @whilewr_8_split2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_split2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    sub x9, x1, x0
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
 ; CHECK-NEXT:    whilewr p0.b, x0, x1
-; CHECK-NEXT:    addvl x9, x0, #3
-; CHECK-NEXT:    incb x0, all, mul #2
-; CHECK-NEXT:    incb x8
-; CHECK-NEXT:    whilewr p3.b, x9, x1
-; CHECK-NEXT:    whilewr p2.b, x0, x1
-; CHECK-NEXT:    whilewr p1.b, x8, x1
+; CHECK-NEXT:    whilelo p1.b, x8, x9
+; CHECK-NEXT:    rdvl x8, #3
+; CHECK-NEXT:    whilelo p2.b, x10, x9
+; CHECK-NEXT:    whilelo p3.b, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 64 x i1> @llvm.loop.dependence.war.mask.nxv64i1(ptr %a, ptr %b, i64 1)
@@ -113,58 +118,12 @@ entry:
 define <vscale x 16 x i1> @whilewr_16_expand(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_16_expand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    add x8, x8, x8, lsr #63
 ; CHECK-NEXT:    asr x8, x8, #1
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    incd z5.d, all, mul #4
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    incd z1.d, all, mul #4
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    cmphi p4.d, p0/z, z2.d, z5.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    cmphi p5.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT:    mov z0.d, z3.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p5.s
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    incd z0.d, all, mul #4
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p6.s
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 2)
@@ -174,89 +133,14 @@ entry:
 define <vscale x 32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_16_expand2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    incb x0, all, mul #2
-; CHECK-NEXT:    add x8, x8, x8, lsr #63
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    asr x8, x8, #1
 ; CHECK-NEXT:    sub x9, x1, x0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, x8
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    add x9, x9, x9, lsr #63
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    incd z3.d, all, mul #4
-; CHECK-NEXT:    cmphi p2.d, p0/z, z5.d, z0.d
 ; CHECK-NEXT:    asr x9, x9, #1
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mov z7.d, z2.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z5.d, z3.d
-; CHECK-NEXT:    cmphi p5.d, p0/z, z5.d, z2.d
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    incd z6.d, all, mul #4
-; CHECK-NEXT:    incd z7.d, all, mul #4
-; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT:    mov z24.d, z4.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z5.d, z6.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    incd z24.d, all, mul #4
-; CHECK-NEXT:    uzp1 p2.s, p3.s, p4.s
-; CHECK-NEXT:    uzp1 p3.s, p5.s, p6.s
-; CHECK-NEXT:    cmphi p8.d, p0/z, z5.d, z24.d
-; CHECK-NEXT:    mov z5.d, x9
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmphi p4.d, p0/z, z5.d, z24.d
-; CHECK-NEXT:    cmphi p5.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT:    uzp1 p7.s, p7.s, p8.s
-; CHECK-NEXT:    cmphi p9.d, p0/z, z5.d, z3.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    cmphi p8.d, p0/z, z5.d, z2.d
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p7.h
-; CHECK-NEXT:    cmphi p7.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z5.d, z0.d
-; CHECK-NEXT:    uzp1 p4.s, p5.s, p4.s
-; CHECK-NEXT:    uzp1 p5.s, p9.s, p6.s
-; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT:    whilelo p6.b, xzr, x8
-; CHECK-NEXT:    uzp1 p3.s, p8.s, p3.s
 ; CHECK-NEXT:    cmp x9, #1
-; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.s, p0.s, p7.s
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p4.h, p5.h, p4.h
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p0.h, p3.h
-; CHECK-NEXT:    uzp1 p1.b, p1.b, p2.b
-; CHECK-NEXT:    uzp1 p2.b, p0.b, p4.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    whilelo p3.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p1, p1.b, p6.b
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    sel p1.b, p2, p2.b, p3.b
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x9
+; CHECK-NEXT:    whilelo p1.b, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 2)
@@ -266,31 +150,13 @@ entry:
 define <vscale x 8 x i1> @whilewr_32_expand(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    add x9, x8, #3
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #2
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    cmphi p1.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z3.d, z1.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z3.d, z2.d
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT:    cmphi p0.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p3.s, p0.s
-; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
-; CHECK-NEXT:    whilelo p1.h, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4)
@@ -300,59 +166,13 @@ entry:
 define <vscale x 16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    add x9, x8, #3
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #2
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    incd z5.d, all, mul #4
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    incd z1.d, all, mul #4
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    cmphi p4.d, p0/z, z2.d, z5.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    cmphi p5.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT:    mov z0.d, z3.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p5.s
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    incd z0.d, all, mul #4
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p6.s
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 4)
@@ -362,93 +182,15 @@ entry:
 define <vscale x 32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x9, x8, #3
-; CHECK-NEXT:    incb x0, all, mul #4
-; CHECK-NEXT:    csel x8, x9, x8, mi
-; CHECK-NEXT:    asr x8, x8, #2
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    cmphi p5.d, p0/z, z5.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z5.d, z2.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    incd z6.d, all, mul #4
-; CHECK-NEXT:    incd z7.d, all, mul #4
-; CHECK-NEXT:    uzp1 p4.s, p5.s, p4.s
-; CHECK-NEXT:    mov z24.d, z3.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    cmphi p8.d, p0/z, z5.d, z3.d
-; CHECK-NEXT:    incd z24.d, all, mul #4
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p7.s
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p8.s
-; CHECK-NEXT:    cmphi p9.d, p0/z, z5.d, z24.d
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    uzp1 p3.h, p4.h, p3.h
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p6.s, p6.s, p9.s
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p6.h
-; CHECK-NEXT:    add x9, x8, #3
-; CHECK-NEXT:    csel x8, x9, x8, mi
-; CHECK-NEXT:    uzp1 p2.b, p3.b, p2.b
-; CHECK-NEXT:    asr x8, x8, #2
-; CHECK-NEXT:    mov z5.d, x8
-; CHECK-NEXT:    cmphi p5.d, p0/z, z5.d, z24.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z5.d, z6.d
-; CHECK-NEXT:    cmphi p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    cmphi p9.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z5.d, z3.d
-; CHECK-NEXT:    cmphi p10.d, p0/z, z5.d, z2.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z5.d, z0.d
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    uzp1 p5.s, p7.s, p5.s
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    uzp1 p7.s, p9.s, p8.s
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p4.s, p10.s, p4.s
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.s, p0.s, p6.s
-; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p5.h, p7.h, p5.h
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p0.h, p4.h
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    whilelo p4.b, xzr, x8
-; CHECK-NEXT:    uzp1 p3.b, p0.b, p5.b
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    sel p0.b, p2, p2.b, p1.b
-; CHECK-NEXT:    sel p1.b, p3, p3.b, p4.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    subs x9, x1, x0
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    add x10, x9, #3
+; CHECK-NEXT:    csel x9, x10, x9, mi
+; CHECK-NEXT:    asr x9, x9, #2
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x9
+; CHECK-NEXT:    whilelo p1.b, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 4)
@@ -458,23 +200,13 @@ entry:
 define <vscale x 4 x i1> @whilewr_64_expand(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    add x9, x8, #7
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z1.d
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p1.s, p0.s
-; CHECK-NEXT:    whilelo p1.s, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 8)
@@ -484,31 +216,13 @@ entry:
 define <vscale x 8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    add x9, x8, #7
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    cmphi p1.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z3.d, z1.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z3.d, z2.d
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT:    cmphi p0.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p3.s, p0.s
-; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
-; CHECK-NEXT:    whilelo p1.h, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 8)
@@ -518,59 +232,13 @@ entry:
 define <vscale x 16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    add x9, x8, #7
 ; CHECK-NEXT:    csel x8, x9, x8, mi
 ; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    incd z5.d, all, mul #4
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    incd z1.d, all, mul #4
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    cmphi p4.d, p0/z, z2.d, z5.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    cmphi p5.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT:    mov z0.d, z3.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p5.s
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    incd z0.d, all, mul #4
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p6.s
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8)
@@ -580,93 +248,15 @@ entry:
 define <vscale x 32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    subs x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x9, x8, #7
-; CHECK-NEXT:    csel x8, x9, x8, mi
-; CHECK-NEXT:    addvl x9, x0, #8
-; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, x8
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    cmphi p5.d, p0/z, z5.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z5.d, z2.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    incd z6.d, all, mul #4
-; CHECK-NEXT:    incd z7.d, all, mul #4
-; CHECK-NEXT:    uzp1 p4.s, p5.s, p4.s
-; CHECK-NEXT:    mov z24.d, z3.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    cmphi p8.d, p0/z, z5.d, z3.d
-; CHECK-NEXT:    incd z24.d, all, mul #4
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p7.s
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p8.s
-; CHECK-NEXT:    cmphi p9.d, p0/z, z5.d, z24.d
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    uzp1 p3.h, p4.h, p3.h
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p6.s, p6.s, p9.s
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    subs x8, x1, x9
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p6.h
-; CHECK-NEXT:    add x9, x8, #7
-; CHECK-NEXT:    csel x8, x9, x8, mi
-; CHECK-NEXT:    uzp1 p2.b, p3.b, p2.b
-; CHECK-NEXT:    asr x8, x8, #3
-; CHECK-NEXT:    mov z5.d, x8
-; CHECK-NEXT:    cmphi p5.d, p0/z, z5.d, z24.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z5.d, z6.d
-; CHECK-NEXT:    cmphi p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    cmphi p9.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z5.d, z3.d
-; CHECK-NEXT:    cmphi p10.d, p0/z, z5.d, z2.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z5.d, z0.d
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    uzp1 p5.s, p7.s, p5.s
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    uzp1 p7.s, p9.s, p8.s
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p4.s, p10.s, p4.s
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.s, p0.s, p6.s
-; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p5.h, p7.h, p5.h
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p0.h, p4.h
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    whilelo p4.b, xzr, x8
-; CHECK-NEXT:    uzp1 p3.b, p0.b, p5.b
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    sel p0.b, p2, p2.b, p1.b
-; CHECK-NEXT:    sel p1.b, p3, p3.b, p4.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    subs x9, x1, x0
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    add x10, x9, #7
+; CHECK-NEXT:    csel x9, x10, x9, mi
+; CHECK-NEXT:    asr x9, x9, #3
+; CHECK-NEXT:    cmp x9, #1
+; CHECK-NEXT:    csinv x9, x9, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x9
+; CHECK-NEXT:    whilelo p1.b, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 8)
@@ -706,60 +296,14 @@ entry:
 define <vscale x 16 x i1> @whilewr_badimm(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_badimm:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
 ; CHECK-NEXT:    sub x9, x1, x0
 ; CHECK-NEXT:    movk x8, #21846
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    smulh x8, x9, x8
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    incd z1.d
 ; CHECK-NEXT:    add x8, x8, x8, lsr #63
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    incd z5.d, all, mul #4
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    incd z1.d, all, mul #4
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    cmphi p4.d, p0/z, z2.d, z5.d
-; CHECK-NEXT:    cmphi p5.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    mov z0.d, z3.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT:    incd z0.d, all, mul #4
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p5.s
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p6.s
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 3)

diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll
index d62d0665dd332..541e312757369 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll
@@ -4,54 +4,10 @@
 define <vscale x 16 x i1> @whilewr_8(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    sub x8, x1, x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    incd z0.d, all, mul #4
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z3.d, all, mul #2
-; CHECK-NEXT:    cmphi p5.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    incd z1.d, all, mul #4
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    incd z3.d, all, mul #4
-; CHECK-NEXT:    incd z4.d, all, mul #2
-; CHECK-NEXT:    cmphi p6.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT:    cmphi p4.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    incd z4.d, all, mul #4
-; CHECK-NEXT:    uzp1 p2.s, p5.s, p6.s
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p4.s
 ; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    csinv x8, x8, xzr, ge
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1)

diff --git a/llvm/test/CodeGen/AArch64/loop-dependence-mask-ccmp.ll b/llvm/test/CodeGen/AArch64/loop-dependence-mask-ccmp.ll
deleted file mode 100644
index 2c5e351ee9ba7..0000000000000
--- a/llvm/test/CodeGen/AArch64/loop-dependence-mask-ccmp.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 -verify-machineinstrs -stop-after=finalize-isel %s -o - | FileCheck %s
-
-; Regression test for a bug where getTargetConstant(0) was used instead of
-; getConstant(0) in ScalarizeVecRes_LOOP_DEPENDENCE_MASK, causing instruction
-; selection to incorrectly generate CCMPXr (register form) with an immediate
-; operand instead of CCMPXi (immediate form).
-;
-
-define <1 x i1> @test_war_mask_ccmp(ptr %a, ptr %b) {
-  ; CHECK-LABEL: name: test_war_mask_ccmp
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   liveins: $x0, $x1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x1
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
-  ; CHECK-NEXT:   [[SUBSXrr:%[0-9]+]]:gpr64common = SUBSXrr [[COPY]], [[COPY1]], implicit-def dead $nzcv
-  ; CHECK-NEXT:   [[ADDSXri:%[0-9]+]]:gpr64 = ADDSXri killed [[SUBSXrr]], 1, 0, implicit-def $nzcv
-  ; CHECK-NEXT:   [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
-  ; CHECK-NEXT:   $w0 = COPY [[CSINCWr]]
-  ; CHECK-NEXT:   RET_ReallyLR implicit $w0
-entry:
-  %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 1)
-  ret <1 x i1> %0
-}
-
-define <1 x i1> @test_raw_mask_ccmp(ptr %a, ptr %b) {
-  ; CHECK-LABEL: name: test_raw_mask_ccmp
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   liveins: $x0, $x1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x1
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
-  ; CHECK-NEXT:   [[SUBSXrr:%[0-9]+]]:gpr64common = SUBSXrr [[COPY]], [[COPY1]], implicit-def dead $nzcv
-  ; CHECK-NEXT:   [[ADDSXri:%[0-9]+]]:gpr64 = ADDSXri killed [[SUBSXrr]], 1, 0, implicit-def $nzcv
-  ; CHECK-NEXT:   [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
-  ; CHECK-NEXT:   $w0 = COPY [[CSINCWr]]
-  ; CHECK-NEXT:   RET_ReallyLR implicit $w0
-entry:
-  %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 1)
-  ret <1 x i1> %0
-}
-
-declare <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr, ptr, i64)
-declare <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr, ptr, i64)

More information about the llvm-commits mailing list