[llvm-branch-commits] [llvm] 0e49a40 - [ARM] Cleanup for the MVETailPrediction pass

David Green via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Nov 26 07:24:11 PST 2020


Author: David Green
Date: 2020-11-26T15:10:44Z
New Revision: 0e49a40d756b4487aebea436f8f84411c1a629e7

URL: https://github.com/llvm/llvm-project/commit/0e49a40d756b4487aebea436f8f84411c1a629e7
DIFF: https://github.com/llvm/llvm-project/commit/0e49a40d756b4487aebea436f8f84411c1a629e7.diff

LOG: [ARM] Cleanup for the MVETailPrediction pass

This strips out a lot of the code that should no longer be needed from
the MVETailPredication pass, leaving the important part: find active lane
mask intrinsics and convert them to VCTP operations.
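
As a rough illustration (a hand-written sketch, not code from this patch;
the value names and loop structure are assumed), the pass rewrites a
predicate produced by the vectorizer, such as

  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

into a VCTP driven by a running element count that the pass inserts into
the loop:

vector.body:
  %elems = phi i32 [ %N, %vector.ph ], [ %elems.rem, %vector.body ]
  %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
  %elems.rem = sub i32 %elems, 4

The ARM Low-overhead loop pass later in the backend folds this into a DLSTP
or WLSTP tail-predicated loop.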

Differential Revision: https://reviews.llvm.org/D91866

Added: 
    

Modified: 
    llvm/lib/Target/ARM/MVETailPredication.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
    llvm/test/CodeGen/Thumb2/active_lane_mask.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 25d5fd7e69c6..8055b5cf500d 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -22,23 +22,13 @@
 /// The HardwareLoops pass inserts intrinsics identifying loops that the
 /// backend will attempt to convert into a low-overhead loop. The vectorizer is
 /// responsible for generating a vectorized loop in which the lanes are
-/// predicated upon the iteration counter. This pass looks at these predicated
-/// vector loops that are targets for low-overhead loops, and prepares them for
-/// code generation. Once the vectorizer has produced a masked loop, there are
-/// a couple of final forms:
-/// - A tail-predicated loop, with implicit predication.
-/// - A loop containing multiple VCTP instructions, predicating multiple VPT
-///   blocks of instructions operating on different vector types.
-///
-/// This pass:
-/// 1) Checks if the predicates of the masked load/store instructions are
-///    generated by the @llvm.get.active.lane.mask() intrinsic. This intrinsic
-///    consumes the scalar loop tripcount as its second argument, which we
-///    extract to set up the number of elements processed by the loop.
-/// 2) Intrinsic @llvm.get.active.lane.mask() is then replaced by the MVE target
-///    specific VCTP intrinsic to represent the effect of tail predication.
-///    This will be picked up by the ARM Low-overhead loop pass, which performs
-///    the final transformation to a DLSTP or WLSTP tail-predicated loop.
+/// predicated upon a get.active.lane.mask intrinsic. This pass looks at these
+/// get.active.lane.mask intrinsics and attempts to convert them to VCTP
+/// instructions. This will be picked up by the ARM Low-overhead loop pass later
+/// in the backend, which performs the final transformation to a DLSTP or WLSTP
+/// tail-predicated loop.
+//
+//===----------------------------------------------------------------------===//
 
 #include "ARM.h"
 #include "ARMSubtarget.h"
@@ -57,6 +47,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 
@@ -112,23 +103,18 @@ class MVETailPredication : public LoopPass {
   bool runOnLoop(Loop *L, LPPassManager&) override;
 
 private:
-  /// Perform the relevant checks on the loop and convert if possible.
-  bool TryConvert(Value *TripCount);
-
-  /// Return whether this is a vectorized loop, that contains masked
-  /// load/stores.
-  bool IsPredicatedVectorLoop();
+  /// Perform the relevant checks on the loop and convert active lane masks if
+  /// possible.
+  bool TryConvertActiveLaneMask(Value *TripCount);
 
   /// Perform several checks on the arguments of @llvm.get.active.lane.mask
   /// intrinsic. E.g., check that the loop induction variable and the element
   /// count are of the form we expect, and also perform overflow checks for
   /// the new expressions that are created.
-  bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                        FixedVectorType *VecTy);
+  bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount);
 
   /// Insert the intrinsic to represent the effect of tail predication.
-  void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                           FixedVectorType *VecTy);
+  void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount);
 
   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
@@ -138,25 +124,6 @@ class MVETailPredication : public LoopPass {
 
 } // end namespace
 
-static bool IsDecrement(Instruction &I) {
-  auto *Call = dyn_cast<IntrinsicInst>(&I);
-  if (!Call)
-    return false;
-
-  Intrinsic::ID ID = Call->getIntrinsicID();
-  return ID == Intrinsic::loop_decrement_reg;
-}
-
-static bool IsMasked(Instruction *I) {
-  auto *Call = dyn_cast<IntrinsicInst>(I);
-  if (!Call)
-    return false;
-
-  Intrinsic::ID ID = Call->getIntrinsicID();
-  return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load ||
-         isGatherScatter(Call);
-}
-
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || !EnableTailPredication)
     return false;
@@ -207,147 +174,11 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
       return false;
   }
 
-  // Search for the hardware loop intrinsic that decrements the loop counter.
-  IntrinsicInst *Decrement = nullptr;
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      if (IsDecrement(I)) {
-        Decrement = cast<IntrinsicInst>(&I);
-        break;
-      }
-    }
-  }
-
-  if (!Decrement)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
-             << *Decrement << "\n");
-
-  if (!TryConvert(Setup->getArgOperand(0))) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
-    return false;
-  }
-
-  return true;
-}
-
-static FixedVectorType *getVectorType(IntrinsicInst *I) {
-  unsigned ID = I->getIntrinsicID();
-  FixedVectorType *VecTy;
-  if (ID == Intrinsic::masked_load || isGather(I)) {
-    if (ID == Intrinsic::arm_mve_vldr_gather_base_wb ||
-        ID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated)
-      // then the type is a StructType
-      VecTy = dyn_cast<FixedVectorType>(I->getType()->getContainedType(0));
-    else
-      VecTy = dyn_cast<FixedVectorType>(I->getType());
-  } else if (ID == Intrinsic::masked_store) {
-    VecTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
-  } else {
-    VecTy = dyn_cast<FixedVectorType>(I->getOperand(2)->getType());
-  }
-  assert(VecTy && "No scalable vectors expected here");
-  return VecTy;
-}
-
-bool MVETailPredication::IsPredicatedVectorLoop() {
-  // Check that the loop contains at least one masked load/store intrinsic.
-  // We only support 'normal' vector instructions - other than masked
-  // load/stores.
-  bool ActiveLaneMask = false;
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      auto *Int = dyn_cast<IntrinsicInst>(&I);
-      if (!Int)
-        continue;
-
-      switch (Int->getIntrinsicID()) {
-      case Intrinsic::get_active_lane_mask:
-        ActiveLaneMask = true;
-        continue;
-      case Intrinsic::sadd_sat:
-      case Intrinsic::uadd_sat:
-      case Intrinsic::ssub_sat:
-      case Intrinsic::usub_sat:
-      case Intrinsic::vector_reduce_add:
-        continue;
-      case Intrinsic::fma:
-      case Intrinsic::trunc:
-      case Intrinsic::rint:
-      case Intrinsic::round:
-      case Intrinsic::floor:
-      case Intrinsic::ceil:
-      case Intrinsic::fabs:
-        if (ST->hasMVEFloatOps())
-          continue;
-        break;
-      default:
-        break;
-      }
-      if (IsMasked(&I)) {
-        auto *VecTy = getVectorType(Int);
-        unsigned Lanes = VecTy->getNumElements();
-        unsigned ElementWidth = VecTy->getScalarSizeInBits();
-        // MVE vectors are 128-bit, but don't support 128 x i1.
-        // TODO: Can we support vectors larger than 128-bits?
-        unsigned MaxWidth = TTI->getRegisterBitWidth(true);
-        if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
-          return false;
-        MaskedInsts.push_back(cast<IntrinsicInst>(&I));
-        continue;
-      }
-
-      for (const Use &U : Int->args()) {
-        if (isa<VectorType>(U->getType()))
-          return false;
-      }
-    }
-  }
-
-  if (!ActiveLaneMask) {
-    LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
-    return false;
-  }
-  return !MaskedInsts.empty();
-}
-
-// Look through the exit block to see whether there's a duplicate predicate
-// instruction. This can happen when we need to perform a select on values
-// from the last and previous iteration. Instead of doing a straight
-// replacement of that predicate with the vctp, clone the vctp and place it
-// in the block. This means that the VPR doesn't have to be live into the
-// exit block which should make it easier to convert this loop into a proper
-// tail predicated loop.
-static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
-  BasicBlock *Exit = L->getUniqueExitBlock();
-  if (!Exit) {
-    LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
-    return;
-  }
-
-  // Drop references and add operands to check for dead.
-  SmallPtrSet<Instruction*, 4> Dead;
-  while (!MaybeDead.empty()) {
-    auto *I = MaybeDead.front();
-    MaybeDead.remove(I);
-    if (I->hasNUsesOrMore(1))
-      continue;
+  LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n");
 
-    for (auto &U : I->operands())
-      if (auto *OpI = dyn_cast<Instruction>(U))
-        MaybeDead.insert(OpI);
+  bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0));
 
-    Dead.insert(I);
-  }
-
-  for (auto *I : Dead) {
-    LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
-    I->eraseFromParent();
-  }
-
-  for (auto I : L->blocks())
-    DeleteDeadPHIs(I);
+  return Changed;
 }
 
 // The active lane intrinsic has this form:
@@ -368,7 +199,7 @@ static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
 // 3) The IV must be an induction phi with an increment equal to the
 //    vector width.
 bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy) {
+                                          Value *TripCount) {
   bool ForceTailPredication =
     EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
     EnableTailPredication == TailPredication::ForceEnabled;
@@ -376,7 +207,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
   Value *ElemCount = ActiveLaneMask->getOperand(1);
   auto *EC= SE->getSCEV(ElemCount);
   auto *TC = SE->getSCEV(TripCount);
-  int VectorWidth = VecTy->getNumElements();
+  int VectorWidth =
+      cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
+  if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16)
+    return false;
   ConstantInt *ConstElemCount = nullptr;
 
   // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to
@@ -503,21 +337,22 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
   if (VectorWidth == StepValue)
     return true;
 
-  LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
-             "vector width " << VectorWidth << "\n");
+  LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue
+                    << " doesn't match vector width " << VectorWidth << "\n");
 
   return false;
 }
 
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy) {
+                                             Value *TripCount) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
-  unsigned VectorWidth = VecTy->getNumElements();
+  unsigned VectorWidth =
+      cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
 
   // Insert a phi to count the number of elements processed by the loop.
-  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()  );
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
   Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
 
@@ -553,50 +388,36 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
              << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
 }
 
-bool MVETailPredication::TryConvert(Value *TripCount) {
-  if (!IsPredicatedVectorLoop()) {
-    LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
+bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) {
+  SmallVector<IntrinsicInst *, 4> ActiveLaneMasks;
+  for (auto *BB : L->getBlocks())
+    for (auto &I : *BB)
+      if (auto *Int = dyn_cast<IntrinsicInst>(&I))
+        if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask)
+          ActiveLaneMasks.push_back(Int);
+
+  if (ActiveLaneMasks.empty())
     return false;
-  }
 
   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
-  SetVector<Instruction*> Predicates;
-
-  auto getPredicateOp = [](IntrinsicInst *I) {
-    unsigned IntrinsicID = I->getIntrinsicID();
-    if (IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
-        IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated)
-      return 5;
-    return (IntrinsicID == Intrinsic::masked_load || isGather(I)) ? 2 : 3;
-  };
-
-  // Walk through the masked intrinsics and try to find whether the predicate
-  // operand is generated by intrinsic @llvm.get.active.lane.mask().
-  for (auto *I : MaskedInsts) {
-    Value *PredOp = I->getArgOperand(getPredicateOp(I));
-    auto *Predicate = dyn_cast<Instruction>(PredOp);
-    if (!Predicate || Predicates.count(Predicate))
-      continue;
 
-    auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
-    if (!ActiveLaneMask ||
-        ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
-      continue;
-
-    Predicates.insert(Predicate);
+  for (auto *ActiveLaneMask : ActiveLaneMasks) {
     LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
                       << *ActiveLaneMask << "\n");
 
-    auto *VecTy = getVectorType(I);
-    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+    if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) {
       LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
       return false;
     }
     LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
-    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount);
   }
 
-  Cleanup(Predicates, L);
+  // Remove dead instructions and now dead phis.
+  for (auto *II : ActiveLaneMasks)
+    RecursivelyDeleteTriviallyDeadInstructions(II);
+  for (auto I : L->blocks())
+    DeleteDeadPHIs(I);
   return true;
 }
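
For reference, the loop shape that IsSafeActiveMask accepts looks roughly
like this (a hand-written sketch, not taken from the patch): the first
operand of the mask is an induction phi whose step equals the vector width,
and the second operand is the element count of the scalar loop:

vector.body:
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ...
  %index.next = add i32 %index, 4

With the explicit width check added above, masks of any other width (such as
the <2 x i1> case in the test added below) are rejected up front and the loop
keeps its explicit mask computation.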
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
index 5c97f1b953cf..bd927fdcf859 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@@ -241,42 +241,18 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA,
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
-; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    vdup.32 q1, r2
-; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    sub.w r12, r3, #4
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    adr r3, .LCPI5_0
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vadd.i32 q2, q0, r12
-; CHECK-NEXT:    vdup.32 q3, r12
-; CHECK-NEXT:    vcmp.u32 hi, q3, q2
-; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    vpnot
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vcmpt.u32 hi, q1, q2
-; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT:    vrintr.f32 s15, s11
-; CHECK-NEXT:    vrintr.f32 s14, s10
-; CHECK-NEXT:    vrintr.f32 s13, s9
-; CHECK-NEXT:    vrintr.f32 s12, s8
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vstrwt.32 q3, [r1], #16
-; CHECK-NEXT:    le lr, .LBB5_2
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
+; CHECK-NEXT:    vrintr.f32 s7, s3
+; CHECK-NEXT:    vrintr.f32 s6, s2
+; CHECK-NEXT:    vrintr.f32 s5, s1
+; CHECK-NEXT:    vrintr.f32 s4, s0
+; CHECK-NEXT:    vstrw.32 q1, [r1], #16
+; CHECK-NEXT:    letp lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.4:
-; CHECK-NEXT:  .LCPI5_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 1 @ 0x1
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 3 @ 0x3
 entry:
   %cmp5 = icmp eq i32 %n, 0
   br i1 %cmp5, label %for.cond.cleanup, label %vector.ph
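
The nearbyint test above shows the end-to-end effect: the explicit induction
vector, the vdup/vcmp mask computation and the .LCPI5_0 lane-index constant
pool are all gone, replaced by a dlstp.32/letp tail-predicated loop that
counts elements directly in r2.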

diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 2a5d32013d47..729493163b81 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -410,7 +410,156 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
   ret <16 x i8> %select
 }
 
+define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) {
+; CHECK-LABEL: test_width2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    beq.w .LBB4_3
+; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    adds r0, r2, #1
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    bic r0, r0, #1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    subs r0, #2
+; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    add.w lr, r3, r0, lsr #1
+; CHECK-NEXT:    adr r3, .LCPI4_0
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:    vand q2, q2, q0
+; CHECK-NEXT:  .LBB4_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmov.32 q3[0], r6
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov.32 q3[2], r6
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vand q3, q3, q0
+; CHECK-NEXT:    adds r6, #2
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    adds r3, #1
+; CHECK-NEXT:    adc r12, r2, #0
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vand q3, q3, q0
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    teq.w r4, r2
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov.32 q4[1], r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    eors r3, r2
+; CHECK-NEXT:    orrs.w r3, r3, r12
+; CHECK-NEXT:    cset r3, ne
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    subs r5, r4, r5
+; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    veor q4, q4, q1
+; CHECK-NEXT:    sbcs.w r0, r3, r0
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    subs r2, r2, r5
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    @ implicit-def: $q3
+; CHECK-NEXT:    sbcs r0, r3
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vand q4, q4, q5
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    and r2, r2, #1
+; CHECK-NEXT:    orr.w r3, r2, r0, lsl #1
+; CHECK-NEXT:    sub.w r2, r1, #8
+; CHECK-NEXT:    lsls r0, r3, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    ldrne r0, [r2]
+; CHECK-NEXT:    vmovne.32 q3[0], r0
+; CHECK-NEXT:    and r0, r3, #3
+; CHECK-NEXT:    lsls r0, r0, #30
+; CHECK-NEXT:    itt mi
+; CHECK-NEXT:    ldrmi r0, [r2, #4]
+; CHECK-NEXT:    vmovmi.32 q3[2], r0
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    and r2, r2, #1
+; CHECK-NEXT:    orr.w r2, r2, r0, lsl #1
+; CHECK-NEXT:    lsls r0, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    vmovne r0, s12
+; CHECK-NEXT:    strne r0, [r1]
+; CHECK-NEXT:    and r0, r2, #3
+; CHECK-NEXT:    lsls r0, r0, #30
+; CHECK-NEXT:    itt mi
+; CHECK-NEXT:    vmovmi r0, s14
+; CHECK-NEXT:    strmi r0, [r1, #4]
+; CHECK-NEXT:    adds r1, #8
+; CHECK-NEXT:    le lr, .LBB4_2
+; CHECK-NEXT:  .LBB4_3: @ %for.cond.cleanup
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:  .LCPI4_0:
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 0 @ 0x0
+entry:
+  %cmp9.not = icmp eq i8 %m, 0
+  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i8 %m to i32
+  %n.rnd.up = add nuw nsw i32 %wide.trip.count, 1
+  %n.vec = and i32 %n.rnd.up, 510
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %for.body.preheader
+  %index = phi i32 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %wide.trip.count)
+  %0 = add nsw i32 %index, -2
+  %1 = getelementptr inbounds i32, i32* %y, i32 %0
+  %2 = bitcast i32* %1 to <2 x i32>*
+  %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %2, i32 4, <2 x i1> %active.lane.mask, <2 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %y, i32 %index
+  %4 = bitcast i32* %3 to <2 x i32>*
+  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %wide.masked.load, <2 x i32>* %4, i32 4, <2 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 2
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
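
The new test_width2 covers the rejection path: a <2 x i1>
get.active.lane.mask has no matching VCTP (the pass only emits VCTPs for 4,
8 and 16 lanes), so the loop remains a normal low-overhead loop (dls/le)
with its mask computed explicitly.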


        

