[llvm] 89e1248 - [ARM][MVE] Optimise offset addresses of gathers/scatters

Anna Welker via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 8 03:47:20 PDT 2020


Author: Anna Welker
Date: 2020-04-08T11:46:57+01:00
New Revision: 89e1248d7b76886912f499391719e68b27e42ec3

URL: https://github.com/llvm/llvm-project/commit/89e1248d7b76886912f499391719e68b27e42ec3
DIFF: https://github.com/llvm/llvm-project/commit/89e1248d7b76886912f499391719e68b27e42ec3.diff

LOG: [ARM][MVE] Optimise offset addresses of gathers/scatters

This patch adds an analysis of the offset addresses used by gathers
and scatters to the MVEGatherScatterLowering pass, finding
multiplications and additions that are loop invariant and can
therefore be moved into the loop preheader, avoiding having to
execute them on every loop iteration.

Differential Revision: https://reviews.llvm.org/D76681
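
For illustration, a minimal sketch of the idea (abridged from the new
push_out_add_gather test added below): a loop-invariant vector add feeding
the offsets of a masked gather is absorbed into the vector induction phi,
so the add is executed once in the preheader instead of on every iteration.
The gather itself is subsequently lowered to an MVE gather-with-offsets
intrinsic as before.

Before (offsets recomputed in the loop body):

  vector.body:
    %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
    %0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
    %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
    %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
    %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>

After (the invariant add is folded into the phi's start value):

  vector.ph:
    %PushedOutAdd = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
    br label %vector.body
  vector.body:
    %vec.ind = phi <4 x i32> [ %PushedOutAdd, %vector.ph ], [ %vec.ind.next, %vector.body ]
    %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind
    %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
    %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>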

Added: 
    llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll

Modified: 
    llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
    llvm/test/CodeGen/ARM/O3-pipeline.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index b90440b700f8..cc5970573164 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <cassert>
 
@@ -67,6 +68,7 @@ class MVEGatherScatterLowering : public FunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<TargetPassConfig>();
+    AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 
@@ -83,7 +85,7 @@ class MVEGatherScatterLowering : public FunctionPass {
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
 
-  bool lowerGather(IntrinsicInst *I);
+  Value *lowerGather(IntrinsicInst *I);
   // Create a gather from a base + vector of offsets
   Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
                                      Instruction *&Root, IRBuilder<> &Builder);
@@ -91,13 +93,22 @@ class MVEGatherScatterLowering : public FunctionPass {
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
                                    IRBuilder<> &Builder);
 
-  bool lowerScatter(IntrinsicInst *I);
+  Value *lowerScatter(IntrinsicInst *I);
   // Create a scatter to a base + vector of offsets
-  Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Ptr,
+  Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets,
                                       IRBuilder<> &Builder);
   // Create a scatter to a vector of pointers
   Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
                                     IRBuilder<> &Builder);
+
+  // Check whether these offsets could be moved out of the loop they're in
+  bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
+  // Pushes the given add out of the loop
+  void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex);
+  // Pushes the given mul out of the loop
+  void pushOutMul(PHINode *&Phi, Value *IncrementPerRound,
+                  Value *OffsSecondOperand, unsigned LoopIncrement,
+                  IRBuilder<> &Builder);
 };
 
 } // end anonymous namespace
@@ -205,7 +216,7 @@ int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
   return -1;
 }
 
-bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
+Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   using namespace PatternMatch;
   LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
 
@@ -220,7 +231,7 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
 
   if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
                                Ty->getScalarSizeInBits(), Alignment))
-    return false;
+    return nullptr;
   lookThroughBitcast(Ptr);
   assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
 
@@ -233,7 +244,7 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
   if (!Load)
-    return false;
+    return nullptr;
 
   if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
     LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
@@ -247,12 +258,14 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
     // If this was an extending gather, we need to get rid of the sext/zext
     // sext/zext as well as of the gather itself
     I->eraseFromParent();
+
   LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
-  return true;
+  return Load;
 }
 
-Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
+                                                           Value *Ptr,
+                                                           IRBuilder<> &Builder) {
   using namespace PatternMatch;
   Type *Ty = I->getType();
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -287,7 +300,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
     if (!I->hasOneUse())
       return nullptr;
 
-    // The correct root to replace is the not the CallInst itself, but the
+    // The correct root to replace is not the CallInst itself, but the
     // instruction which extends it
     Extend = cast<Instruction>(*I->users().begin());
     if (isa<SExtInst>(Extend)) {
@@ -334,7 +347,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
          Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
-bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
   using namespace PatternMatch;
   LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
 
@@ -348,7 +361,7 @@ bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
 
   if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
                                Ty->getScalarSizeInBits(), Alignment))
-    return false;
+    return nullptr;
   lookThroughBitcast(Ptr);
   assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
 
@@ -360,12 +373,12 @@ bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
   if (!Store)
     Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
   if (!Store)
-    return false;
+    return nullptr;
 
   LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
   I->replaceAllUsesWith(Store);
   I->eraseFromParent();
-  return true;
+  return Store;
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
@@ -445,6 +458,263 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
          Builder.getInt32(Scale)});
 }
 
+void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
+                                          Value *OffsSecondOperand,
+                                          unsigned StartIndex) {
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n");
+  Instruction *InsertionPoint;
+  if (isa<Instruction>(OffsSecondOperand))
+    InsertionPoint = &cast<Instruction>(OffsSecondOperand)->getParent()->back();
+  else
+    InsertionPoint =
+        &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back());
+  // Initialize the phi with a vector that contains a sum of the constants
+  Instruction *NewIndex = BinaryOperator::Create(
+      Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand,
+      "PushedOutAdd", InsertionPoint);
+  unsigned IncrementIndex = StartIndex == 0 ? 1 : 0;
+
+  // Order such that start index comes first (this reduces mov's)
+  Phi->addIncoming(NewIndex, Phi->getIncomingBlock(StartIndex));
+  Phi->addIncoming(Phi->getIncomingValue(IncrementIndex),
+                   Phi->getIncomingBlock(IncrementIndex));
+  Phi->removeIncomingValue(IncrementIndex);
+  Phi->removeIncomingValue(StartIndex);
+}
+
+void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
+                                          Value *IncrementPerRound,
+                                          Value *OffsSecondOperand,
+                                          unsigned LoopIncrement,
+                                          IRBuilder<> &Builder) {
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n");
+
+  // Create a new scalar add outside of the loop and transform it to a splat
+  // by which loop variable can be incremented
+  Instruction *InsertionPoint;
+  if (isa<Instruction>(OffsSecondOperand))
+    InsertionPoint = &cast<Instruction>(OffsSecondOperand)->getParent()->back();
+  else
+    InsertionPoint = &cast<Instruction>(
+        Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
+
+  // Create a new index
+  Value *StartIndex = BinaryOperator::Create(
+      Instruction::Mul, Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1),
+      OffsSecondOperand, "PushedOutMul", InsertionPoint);
+
+  Instruction *Product =
+      BinaryOperator::Create(Instruction::Mul, IncrementPerRound,
+                             OffsSecondOperand, "Product", InsertionPoint);
+  // Increment NewIndex by Product instead of the multiplication
+  Instruction *NewIncrement = BinaryOperator::Create(
+      Instruction::Add, Phi, Product, "IncrementPushedOutMul",
+      cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back())
+          .getPrevNode());
+
+  Phi->addIncoming(StartIndex,
+                   Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1));
+  Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement));
+  Phi->removeIncomingValue((unsigned)0);
+  Phi->removeIncomingValue((unsigned)0);
+  return;
+}
+
+// Return true if the given intrinsic is a gather or scatter
+bool isGatherScatter(IntrinsicInst *IntInst) {
+  if (IntInst == nullptr)
+    return false;
+  unsigned IntrinsicID = IntInst->getIntrinsicID();
+  return (IntrinsicID == Intrinsic::masked_gather ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
+          IntrinsicID == Intrinsic::masked_scatter ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
+}
+
+// Check whether all usages of this instruction are as offsets of
+// gathers/scatters or simple arithmetics only used by gathers/scatters
+bool hasAllGatScatUsers(Instruction *I) {
+  if (I->hasNUses(0)) {
+    return false;
+  }
+  bool Gatscat = true;
+  for (User *U : I->users()) {
+    if (!isa<Instruction>(U))
+      return false;
+    if (isa<GetElementPtrInst>(U) ||
+        isGatherScatter(dyn_cast<IntrinsicInst>(U))) {
+      return Gatscat;
+    } else {
+      unsigned OpCode = cast<Instruction>(U)->getOpcode();
+      if ((OpCode == Instruction::Add || OpCode == Instruction::Mul) &&
+          hasAllGatScatUsers(cast<Instruction>(U))) {
+        continue;
+      }
+      return false;
+    }
+  }
+  return Gatscat;
+}
+
+bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
+                                               LoopInfo *LI) {
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n");
+  // Optimise the addresses of gathers/scatters by moving invariant
+  // calculations out of the loop
+  if (!isa<Instruction>(Offsets))
+    return false;
+  Instruction *Offs = cast<Instruction>(Offsets);
+  if (Offs->getOpcode() != Instruction::Add &&
+      Offs->getOpcode() != Instruction::Mul)
+    return false;
+  Loop *L = LI->getLoopFor(BB);
+  if (L == nullptr)
+    return false;
+  if (!Offs->hasOneUse()) {
+    if (!hasAllGatScatUsers(Offs))
+      return false;
+  }
+
+  // Find out which, if any, operand of the instruction
+  // is a phi node
+  PHINode *Phi;
+  int OffsSecondOp;
+  if (isa<PHINode>(Offs->getOperand(0))) {
+    Phi = cast<PHINode>(Offs->getOperand(0));
+    OffsSecondOp = 1;
+  } else if (isa<PHINode>(Offs->getOperand(1))) {
+    Phi = cast<PHINode>(Offs->getOperand(1));
+    OffsSecondOp = 0;
+  } else {
+    bool Changed = true;
+    if (isa<Instruction>(Offs->getOperand(0)) &&
+        L->contains(cast<Instruction>(Offs->getOperand(0))))
+      Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI);
+    if (isa<Instruction>(Offs->getOperand(1)) &&
+        L->contains(cast<Instruction>(Offs->getOperand(1))))
+      Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI);
+    if (!Changed) {
+      return false;
+    } else {
+      if (isa<PHINode>(Offs->getOperand(0))) {
+        Phi = cast<PHINode>(Offs->getOperand(0));
+        OffsSecondOp = 1;
+      } else if (isa<PHINode>(Offs->getOperand(1))) {
+        Phi = cast<PHINode>(Offs->getOperand(1));
+        OffsSecondOp = 0;
+      } else {
+        return false;
+      }
+    }
+  }
+  // A phi node we want to perform this function on should be from the
+  // loop header, and shouldn't have more than 2 incoming values
+  if (Phi->getParent() != L->getHeader() ||
+      Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // The phi must be an induction variable
+  Instruction *Op;
+  int IncrementingBlock = -1;
+
+  for (int i = 0; i < 2; i++)
+    if ((Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) != nullptr)
+      if (Op->getOpcode() == Instruction::Add &&
+          (Op->getOperand(0) == Phi || Op->getOperand(1) == Phi))
+        IncrementingBlock = i;
+  if (IncrementingBlock == -1)
+    return false;
+
+  Instruction *IncInstruction =
+      cast<Instruction>(Phi->getIncomingValue(IncrementingBlock));
+
+  // If the phi is not used by anything else, we can just adapt it when
+  // replacing the instruction; if it is, we'll have to duplicate it
+  PHINode *NewPhi;
+  Value *IncrementPerRound = IncInstruction->getOperand(
+      (IncInstruction->getOperand(0) == Phi) ? 1 : 0);
+
+  // Get the value that is added to/multiplied with the phi
+  Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp);
+
+  if (IncrementPerRound->getType() != OffsSecondOperand->getType())
+    // Something has gone wrong, abort
+    return false;
+
+  // Only proceed if the increment per round is a constant or an instruction
+  // which does not originate from within the loop
+  if (!isa<Constant>(IncrementPerRound) &&
+      !(isa<Instruction>(IncrementPerRound) &&
+        !L->contains(cast<Instruction>(IncrementPerRound))))
+    return false;
+
+  if (Phi->getNumUses() == 2) {
+    // No other users -> reuse existing phi (One user is the instruction
+    // we're looking at, the other is the phi increment)
+    if (IncInstruction->getNumUses() != 1) {
+      // If the incrementing instruction does have more users than
+      // our phi, we need to copy it
+      IncInstruction = BinaryOperator::Create(
+          Instruction::BinaryOps(IncInstruction->getOpcode()), Phi,
+          IncrementPerRound, "LoopIncrement", IncInstruction);
+      Phi->setIncomingValue(IncrementingBlock, IncInstruction);
+    }
+    NewPhi = Phi;
+  } else {
+    // There are other users -> create a new phi
+    NewPhi = PHINode::Create(Phi->getType(), 0, "NewPhi", Phi);
+    std::vector<Value *> Increases;
+    // Copy the incoming values of the old phi
+    NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1),
+                        Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
+    IncInstruction = BinaryOperator::Create(
+        Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi,
+        IncrementPerRound, "LoopIncrement", IncInstruction);
+    NewPhi->addIncoming(IncInstruction,
+                        Phi->getIncomingBlock(IncrementingBlock));
+    IncrementingBlock = 1;
+  }
+
+  IRBuilder<> Builder(BB->getContext());
+  Builder.SetInsertPoint(Phi);
+  Builder.SetCurrentDebugLocation(Offs->getDebugLoc());
+
+  switch (Offs->getOpcode()) {
+  case Instruction::Add:
+    pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1);
+    break;
+  case Instruction::Mul:
+    pushOutMul(NewPhi, IncrementPerRound, OffsSecondOperand, IncrementingBlock,
+               Builder);
+    break;
+  default:
+    return false;
+  }
+  LLVM_DEBUG(
+      dbgs() << "masked gathers/scatters: simplified loop variable add/mul\n");
+
+  // The instruction has now been "absorbed" into the phi value
+  Offs->replaceAllUsesWith(NewPhi);
+  if (Offs->hasNUses(0))
+    Offs->eraseFromParent();
+  // Clean up the old increment in case it's unused because we built a new
+  // one
+  if (IncInstruction->hasNUses(0))
+    IncInstruction->eraseFromParent();
+
+  return true;
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -455,6 +725,8 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
     return false;
   SmallVector<IntrinsicInst *, 4> Gathers;
   SmallVector<IntrinsicInst *, 4> Scatters;
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
@@ -466,10 +738,30 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   }
 
   bool Changed = false;
-  for (IntrinsicInst *I : Gathers)
-    Changed |= lowerGather(I);
-  for (IntrinsicInst *I : Scatters)
-    Changed |= lowerScatter(I);
+  for (unsigned i = 0; i < Gathers.size(); i++) {
+    IntrinsicInst *I = Gathers[i];
+    if (isa<GetElementPtrInst>(I->getArgOperand(0)))
+      optimiseOffsets(cast<Instruction>(I->getArgOperand(0))->getOperand(1),
+                      I->getParent(), &LI);
+    Value *L = lowerGather(I);
+    if (L == nullptr)
+      continue;
+    // Get rid of any now dead instructions
+    SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
+    Changed = true;
+  }
 
+  for (unsigned i = 0; i < Scatters.size(); i++) {
+    IntrinsicInst *I = Scatters[i];
+    if (isa<GetElementPtrInst>(I->getArgOperand(1)))
+      optimiseOffsets(cast<Instruction>(I->getArgOperand(1))->getOperand(1),
+                      I->getParent(), &LI);
+    Value *S = lowerScatter(I);
+    if (S == nullptr)
+      continue;
+    // Get rid of any now dead instructions
+    SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
+    Changed = true;
+  }
   return Changed;
 }

diff  --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 6c1a1b22f7f1..cfe15a72a846 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -7,11 +7,11 @@
 ; CHECK-NEXT:    FunctionPass Manager
 ; CHECK-NEXT:      Expand Atomic instructions
 ; CHECK-NEXT:      Simplify the CFG
-; CHECK-NEXT:      MVE gather/scatter lowering
 ; CHECK-NEXT:      Dominator Tree Construction
+; CHECK-NEXT:      Natural Loop Information
+; CHECK-NEXT:      MVE gather/scatter lowering
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:      Module Verifier
-; CHECK-NEXT:      Natural Loop Information
 ; CHECK-NEXT:      Canonicalize natural loops
 ; CHECK-NEXT:      Scalar Evolution Analysis
 ; CHECK-NEXT:      Loop Pass Manager

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
new file mode 100644
index 000000000000..a86a89972cf7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+
+; RUN: opt --mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -S -o 2>/dev/null - | FileCheck %s
+
+define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: @push_out_add_sub_block(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
+; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
+; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
+; CHECK:       lower.block:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
+; CHECK:       vector.body.end:
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+
+vector.ph:
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
+  %0 = icmp eq i32 %index, 50
+  br i1 %0, label %lower.block, label %end
+
+lower.block:                             ; preds = %vector.body
+  %1 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  br label %vector.body.end
+
+vector.body.end:                             ; preds = %lower.block
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: @push_out_mul_sub_block(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
+; CHECK-NEXT:    [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
+; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
+; CHECK:       lower.block:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
+; CHECK:       vector.body.end:
+; CHECK-NEXT:    [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+
+vector.ph:
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
+  %0 = icmp eq i32 %index, 50
+  br i1 %0, label %lower.block, label %end
+
+lower.block:                             ; preds = %vector.body
+  %1 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %2 = add <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6>
+  %3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %4 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %5, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  br label %vector.body.end
+
+vector.body.end:                             ; preds = %lower.block
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+
+define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: @push_out_mul_sub_loop(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
+; CHECK-NEXT:    br label [[VECTOR_2_PH:%.*]]
+; CHECK:       vector.2.ph:
+; CHECK-NEXT:    br label [[VECTOR_2_BODY:%.*]]
+; CHECK:       vector.2.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[TMP0]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    br label [[VECTOR_2_BODY_END:%.*]]
+; CHECK:       vector.2.body.end:
+; CHECK-NEXT:    [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
+; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
+; CHECK:       vector.body.end:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+
+vector.ph:
+  %ind.end = shl i32 %n.vec, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
+  br label %vector.2.ph
+
+vector.2.ph:
+  br label %vector.2.body
+
+vector.2.body:                             ; preds = %vector.body
+  %index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
+  br label %vector.2.body.end
+
+vector.2.body.end:                             ; preds = %lower.block
+  %index.2.next = add i32 %index, 4
+  %5 = icmp eq i32 %index.2.next, 15
+  br i1 %5, label %vector.body.end, label %vector.2.body
+
+vector.body.end:                             ; preds = %lower.block
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
new file mode 100644
index 000000000000..a26b17a29aaa
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -0,0 +1,847 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 11.0.0 (git at github.com:llvm/llvm-project.git 26f04d01a39a33d73fd23165c208b215bf5c350d)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.isvectorized", i32 1}
+!9 = distinct !{!9, !10, !8}
+!10 = !{!"llvm.loop.unroll.runtime.disable"}
+
+
+
+define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: push_out_mul_gather:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI0_0
+; CHECK-NEXT:    vmov.i32 q0, #0x18
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    bne .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 18 @ 0x12
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %3 = bitcast i32* %2 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: push_out_add_gather:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI1_0
+; CHECK-NEXT:    vmov.i32 q1, #0x8
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:  .LBB1_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    bne .LBB1_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 8 @ 0x8
+; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 12 @ 0xc
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
+  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %3 = bitcast i32* %2 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: push_out_mul_add_gather:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI2_0
+; CHECK-NEXT:    vmov.i32 q0, #0x18
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:  .LBB2_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    bne .LBB2_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI2_0:
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 18 @ 0x12
+; CHECK-NEXT:    .long 24 @ 0x18
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonly %data,
+; CHECK-LABEL: push_out_mul_scatter:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r1, .LCPI3_0
+; CHECK-NEXT:    vmov.i32 q1, #0x18
+; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:  .LBB3_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vstrw.32 q0, [r0, q2, uxtw #2]
+; CHECK-NEXT:    vadd.i32 q2, q2, q1
+; CHECK-NEXT:    bne .LBB3_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI3_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 18 @ 0x12
+                                                  i32* noalias nocapture %dst, i32 %n.vec,
+                                                  <4 x i32> %to.store) {
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %to.store, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %2 = icmp eq i32 %index.next, %n.vec
+  br i1 %2, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonly %data,
+; CHECK-LABEL: push_out_add_scatter:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r1, .LCPI4_0
+; CHECK-NEXT:    vmov.i32 q2, #0x8
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:  .LBB4_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    bne .LBB4_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI4_0:
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 8 @ 0x8
+; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 12 @ 0xc
+                                                  i32* noalias nocapture %dst, i32 %n.vec,
+                                                  <4 x i32> %to.store) {
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
+  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %to.store, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %2 = icmp eq i32 %index.next, %n.vec
+  br i1 %2, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture readonly %data,
+; CHECK-LABEL: push_out_mul_gather_scatter:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r1, .LCPI5_0
+; CHECK-NEXT:    vmov.i32 q0, #0x18
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:  .LBB5_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vadd.i32 q3, q1, q0
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vmov q1, q3
+; CHECK-NEXT:    bne .LBB5_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI5_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 18 @ 0x12
+                                                         i32* noalias nocapture %dst, i32 %n.vec) {
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.gather, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %2 = icmp eq i32 %index.next, %n.vec
+  br i1 %2, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: push_out_add_sub_block:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI6_0
+; CHECK-NEXT:    vmov.i32 q1, #0x8
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:  .LBB6_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    bne .LBB6_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI6_0:
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 8 @ 0x8
+; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 12 @ 0xc
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
+  br label %lower.block;
+
+lower.block:                             ; preds = %vector.body
+  %0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
+  %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %3 = bitcast i32* %2 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  br label %vector.body.end
+
+vector.body.end:                             ; preds = %lower.block
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: non_gatscat_use1:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    adr r3, .LCPI7_0
+; CHECK-NEXT:    vmov.i32 q1, #0x8
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vmov.i32 q2, #0x6
+; CHECK-NEXT:    vmov.i32 q3, #0x3
+; CHECK-NEXT:  .LBB7_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmul.i32 q4, q0, q3
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vadd.i32 q4, q4, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q5, [r0, q4, uxtw #2]
+; CHECK-NEXT:    vstrb.8 q5, [r1], #16
+; CHECK-NEXT:    bne .LBB7_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI7_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 4 @ 0x4
+; CHECK-NEXT:    .long 6 @ 0x6
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
+  %non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: non_gatscat_use2:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    adr r3, .LCPI8_0
+; CHECK-NEXT:    vmov.i32 q1, #0x8
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vmov.i32 q2, #0x6
+; CHECK-NEXT:    vmov.i32 q3, #0x3
+; CHECK-NEXT:  .LBB8_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmul.i32 q4, q0, q3
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vadd.i32 q4, q4, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q5, [r0, q4, uxtw #2]
+; CHECK-NEXT:    vstrb.8 q5, [r1], #16
+; CHECK-NEXT:    bne .LBB8_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 4 @ 0x4
+; CHECK-NEXT:    .long 6 @ 0x6
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %ind.end = shl i32 %n.vec, 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
+  %non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
+; CHECK-LABEL: arm_mat_mult_q31:
+; CHECK:       @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    adr r6, .LCPI9_0
+; CHECK-NEXT:    ldrd r9, r12, [sp, #144]
+; CHECK-NEXT:    vldrw.u32 q0, [r6]
+; CHECK-NEXT:    sub.w r6, r12, #1
+; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    vdup.32 q2, r9
+; CHECK-NEXT:    add.w r6, r7, r6, lsr #1
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    bic r6, r6, #3
+; CHECK-NEXT:    vmul.i32 q0, q0, r9
+; CHECK-NEXT:    subs r6, #4
+; CHECK-NEXT:    vshl.i32 q2, q2, #3
+; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    vmov.i32 q3, #0x8
+; CHECK-NEXT:    add.w r4, r7, r6, lsr #2
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:  .LBB9_1: @ %for.cond8.preheader.us.us.preheader
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB9_2 Depth 2
+; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
+; CHECK-NEXT:    mul r10, r8, r9
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    mul r7, r8, r12
+; CHECK-NEXT:    vadd.i32 q0, q0, r7
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:  .LBB9_2: @ %vector.ph
+; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
+; CHECK-NEXT:    @ => This Loop Header: Depth=2
+; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i32 q5, #0x0
+; CHECK-NEXT:    vadd.i32 q6, q0, r7
+; CHECK-NEXT:    dls lr, r4
+; CHECK-NEXT:  .LBB9_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=2
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
+; CHECK-NEXT:    vadd.i32 q1, q7, q3
+; CHECK-NEXT:    vldrw.u32 q4, [r0, q7, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q7, [r1, q6, uxtw #2]
+; CHECK-NEXT:    vadd.i32 q0, q6, q2
+; CHECK-NEXT:    vmov q6, q0
+; CHECK-NEXT:    vmul.i32 q4, q7, q4
+; CHECK-NEXT:    vmov q7, q1
+; CHECK-NEXT:    vadd.i32 q5, q4, q5
+; CHECK-NEXT:    le lr, .LBB9_3
+; CHECK-NEXT:  @ %bb.4: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=2
+; CHECK-NEXT:    add.w r5, r7, r10
+; CHECK-NEXT:    adds r7, #1
+; CHECK-NEXT:    vaddv.u32 r6, q5
+; CHECK-NEXT:    cmp r7, r9
+; CHECK-NEXT:    str.w r6, [r2, r5, lsl #2]
+; CHECK-NEXT:    bne .LBB9_2
+; CHECK-NEXT:  @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT:    add.w r8, r8, #1
+; CHECK-NEXT:    cmp r8, r3
+; CHECK-NEXT:    bne .LBB9_1
+; CHECK-NEXT:  @ %bb.6: @ %for.end25
+; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.7:
+; CHECK-NEXT:  .LCPI9_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 4 @ 0x4
+; CHECK-NEXT:    .long 6 @ 0x6
+
+for.cond8.preheader.us.us.preheader.preheader:    ; preds = %entry
+  %0 = add i32 %l, -1
+  %1 = lshr i32 %0, 1
+  %2 = add nuw i32 %1, 1
+  %min.iters.check = icmp ult i32 %0, 6
+  %n.vec = and i32 %2, -4
+  %ind.end = shl i32 %n.vec, 1
+  %broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
+  %broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp.n = icmp eq i32 %2, %n.vec
+  br label %for.cond8.preheader.us.us.preheader
+
+for.cond8.preheader.us.us.preheader:              ; preds = %for.cond8.preheader.us.us.preheader.preheader, %for.cond4.for.cond.cleanup6_crit_edge.us
+  %i.054.us = phi i32 [ %inc24.us, %for.cond4.for.cond.cleanup6_crit_edge.us ], [ 0, %for.cond8.preheader.us.us.preheader.preheader ]
+  %mul.us = mul i32 %i.054.us, %l
+  %mul18.us = mul i32 %i.054.us, %m
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %mul.us, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.ph
+
+for.cond4.for.cond.cleanup6_crit_edge.us:         ; preds = %for.cond8.for.cond.cleanup10_crit_edge.us.us
+  %inc24.us = add nuw nsw i32 %i.054.us, 1
+  %exitcond85 = icmp eq i32 %inc24.us, %n
+  br i1 %exitcond85, label %for.end25, label %for.cond8.preheader.us.us.preheader
+
+vector.ph:                        ; preds = %middle.block, %for.cond8.preheader.us.us.preheader
+  %j.051.us.us = phi i32 [ %inc.us.us, %middle.block ], [ 0, %for.cond8.preheader.us.us.preheader ]
+  %broadcast.splatinsert88 = insertelement <4 x i32> undef, i32 %j.051.us.us, i32 0
+  %broadcast.splat89 = shufflevector <4 x i32> %broadcast.splatinsert88, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
+  %3 = add <4 x i32> %vec.ind, %broadcast.splat
+  %4 = getelementptr inbounds i32, i32* %A, <4 x i32> %3
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %4, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa !3
+  %5 = mul <4 x i32> %vec.ind, %broadcast.splat87
+  %6 = add <4 x i32> %5, %broadcast.splat89
+  %7 = getelementptr inbounds i32, i32* %B, <4 x i32> %6
+  %wide.masked.gather90 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %7, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa !3
+  %8 = mul nsw <4 x i32> %wide.masked.gather90, %wide.masked.gather
+  %9 = add <4 x i32> %8, %vec.phi
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %10 = icmp eq i32 %index.next, %n.vec
+  br i1 %10, label %middle.block, label %vector.body, !llvm.loop !7
+
+middle.block:                                     ; preds = %vector.body
+  %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
+;for.cond8.for.cond.cleanup10_crit_edge.us.us:     ; preds = %for.body11.us.us, %middle.block
+  %add19.us.us = add i32 %j.051.us.us, %mul18.us
+  %arrayidx20.us.us = getelementptr inbounds i32, i32* %C, i32 %add19.us.us
+  store i32 %11, i32* %arrayidx20.us.us, align 4, !tbaa !3
+  %inc.us.us = add nuw nsw i32 %j.051.us.us, 1
+  %exitcond = icmp eq i32 %inc.us.us, %m
+  br i1 %exitcond, label %for.cond4.for.cond.cleanup6_crit_edge.us, label %vector.ph
+
+for.end25:                                        ; preds = %for.cond4.for.cond.cleanup6_crit_edge.us, %entry
+  ret void
+}
+
+define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i16* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
+; CHECK-LABEL: arm_mat_mult_q15:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    strd r0, r2, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    mov r0, r3
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    ldrne.w lr, [sp, #104]
+; CHECK-NEXT:    cmpne.w lr, #0
+; CHECK-NEXT:    bne .LBB10_2
+; CHECK-NEXT:  .LBB10_1: @ %for.cond.cleanup
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:  .LBB10_2: @ %for.cond1.preheader.us.preheader
+; CHECK-NEXT:    ldr.w r11, [sp, #108]
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    movs r1, #1
+; CHECK-NEXT:    lsl.w r4, lr, #1
+; CHECK-NEXT:    bic r0, r11, #3
+; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    subs r0, #4
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    add.w r8, r1, r0, lsr #2
+; CHECK-NEXT:    lsl.w r0, r11, #1
+; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    adr r0, .LCPI10_0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    vmul.i32 q4, q0, lr
+; CHECK-NEXT:    vdup.32 q0, lr
+; CHECK-NEXT:    vshl.i32 q5, q0, #2
+; CHECK-NEXT:    b .LBB10_5
+; CHECK-NEXT:  .LBB10_3: @ %for.cond5.preheader.us73.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
+; CHECK-NEXT:    bl __aeabi_memclr
+; CHECK-NEXT:    ldr.w lr, [sp, #104]
+; CHECK-NEXT:  .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    add r9, r11
+; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add r1, r0
+; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    beq .LBB10_1
+; CHECK-NEXT:  .LBB10_5: @ %for.cond1.preheader.us
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB10_8 Depth 2
+; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
+; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
+; CHECK-NEXT:    mul r12, r1, lr
+; CHECK-NEXT:    cmp.w r11, #0
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    beq .LBB10_3
+; CHECK-NEXT:  @ %bb.6: @ %for.cond5.preheader.us.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    b .LBB10_8
+; CHECK-NEXT:  .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
+; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    add.w r3, r10, r12
+; CHECK-NEXT:    add.w r10, r10, #1
+; CHECK-NEXT:    cmp r10, lr
+; CHECK-NEXT:    strh.w r2, [r0, r3, lsl #1]
+; CHECK-NEXT:    beq .LBB10_4
+; CHECK-NEXT:  .LBB10_8: @ %for.cond5.preheader.us.us
+; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
+; CHECK-NEXT:    @ => This Loop Header: Depth=2
+; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
+; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
+; CHECK-NEXT:    cmp.w r11, #3
+; CHECK-NEXT:    bhi .LBB10_10
+; CHECK-NEXT:  @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    b .LBB10_13
+; CHECK-NEXT:  .LBB10_10: @ %vector.ph
+; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    vadd.i32 q1, q4, r10
+; CHECK-NEXT:    dls lr, r8
+; CHECK-NEXT:  .LBB10_11: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
+; CHECK-NEXT:    vadd.i32 q2, q1, q5
+; CHECK-NEXT:    vldrh.s32 q3, [r6, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.s32 q1, [r2], #8
+; CHECK-NEXT:    vmul.i32 q1, q3, q1
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    le lr, .LBB10_11
+; CHECK-NEXT:  @ %bb.12: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    ldr r7, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    vaddv.u32 r2, q0
+; CHECK-NEXT:    ldr.w lr, [sp, #104]
+; CHECK-NEXT:    cmp r7, r11
+; CHECK-NEXT:    beq .LBB10_7
+; CHECK-NEXT:  .LBB10_13: @ %for.body8.us.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    mla r3, lr, r7, r10
+; CHECK-NEXT:    sub.w r5, r11, r7
+; CHECK-NEXT:    add r7, r9
+; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    add.w r7, r0, r7, lsl #1
+; CHECK-NEXT:    add.w r3, r6, r3, lsl #1
+; CHECK-NEXT:  .LBB10_14: @ %for.body8.us.us
+; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
+; CHECK-NEXT:    ldrsh.w r1, [r3]
+; CHECK-NEXT:    add r3, r4
+; CHECK-NEXT:    ldrsh r0, [r7], #2
+; CHECK-NEXT:    subs r5, #1
+; CHECK-NEXT:    smlabb r2, r1, r0, r2
+; CHECK-NEXT:    bne .LBB10_14
+; CHECK-NEXT:    b .LBB10_7
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.15:
+; CHECK-NEXT:  .LCPI10_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  %cmp48 = icmp eq i32 %n, 0
+  br i1 %cmp48, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp245 = icmp eq i32 %m, 0
+  %cmp642 = icmp eq i32 %l, 0
+  br i1 %cmp245, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
+
+for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
+  %0 = shl nuw i32 %m, 1
+  %min.iters.check = icmp ult i32 %l, 4
+  %n.vec = and i32 %l, -4
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %m, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp.n = icmp eq i32 %n.vec, %l
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+  %i.049.us = phi i32 [ %inc23.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %1 = mul i32 %i.049.us, %m
+  %mul.us = mul i32 %i.049.us, %l
+  br i1 %cmp642, label %for.cond5.preheader.us73.preheader, label %for.cond5.preheader.us.us
+
+for.cond5.preheader.us73.preheader:               ; preds = %for.cond1.preheader.us
+  %scevgep = getelementptr i16, i16* %C, i32 %1
+  %scevgep82 = bitcast i16* %scevgep to i8*
+  call void @llvm.memset.p0i8.i32(i8* align 2 %scevgep82, i8 0, i32 %0, i1 false)
+  br label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us.us, %for.cond5.preheader.us73.preheader
+  %inc23.us = add nuw nsw i32 %i.049.us, 1
+  %exitcond84 = icmp eq i32 %inc23.us, %n
+  br i1 %exitcond84, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond5.preheader.us.us:                        ; preds = %for.cond1.preheader.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us
+  %j.046.us.us = phi i32 [ %inc20.us.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us ], [ 0, %for.cond1.preheader.us ]
+  br i1 %min.iters.check, label %for.body8.us.us.preheader, label %vector.ph
+
+for.body8.us.us.preheader:                        ; preds = %middle.block, %for.cond5.preheader.us.us
+  %k.044.us.us.ph = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %n.vec, %middle.block ]
+  %sum.043.us.us.ph = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %13, %middle.block ]
+  br label %for.body8.us.us
+
+vector.ph:                                        ; preds = %for.cond5.preheader.us.us
+  %broadcast.splatinsert85 = insertelement <4 x i32> undef, i32 %j.046.us.us, i32 0
+  %broadcast.splat86 = shufflevector <4 x i32> %broadcast.splatinsert85, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
+  %2 = add i32 %index, %mul.us
+  %3 = getelementptr inbounds i16, i16* %A, i32 %2
+  %4 = bitcast i16* %3 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %4, align 2, !tbaa !3
+  %5 = sext <4 x i16> %wide.load to <4 x i32>
+  %6 = mul <4 x i32> %vec.ind, %broadcast.splat
+  %7 = add <4 x i32> %6, %broadcast.splat86
+  %8 = getelementptr inbounds i16, i16* %B, <4 x i32> %7
+  %wide.masked.gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %8, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef), !tbaa !3
+  %9 = sext <4 x i16> %wide.masked.gather to <4 x i32>
+  %10 = mul nsw <4 x i32> %9, %5
+  %11 = add <4 x i32> %10, %vec.phi
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %12 = icmp eq i32 %index.next, %n.vec
+  br i1 %12, label %middle.block, label %vector.body, !llvm.loop !7
+
+middle.block:                                     ; preds = %vector.body
+  %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %11)
+  br i1 %cmp.n, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us.preheader
+
+for.cond5.for.cond.cleanup7_crit_edge.us.us:      ; preds = %for.body8.us.us, %middle.block
+  %add14.us.us.lcssa = phi i32 [ %13, %middle.block ], [ %add14.us.us, %for.body8.us.us ]
+  %conv15.us.us = trunc i32 %add14.us.us.lcssa to i16
+  %add17.us.us = add i32 %j.046.us.us, %1
+  %arrayidx18.us.us = getelementptr inbounds i16, i16* %C, i32 %add17.us.us
+  store i16 %conv15.us.us, i16* %arrayidx18.us.us, align 2, !tbaa !3
+  %inc20.us.us = add nuw nsw i32 %j.046.us.us, 1
+  %exitcond83 = icmp eq i32 %inc20.us.us, %m
+  br i1 %exitcond83, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.cond5.preheader.us.us
+
+for.body8.us.us:                                  ; preds = %for.body8.us.us.preheader, %for.body8.us.us
+  %k.044.us.us = phi i32 [ %inc.us.us, %for.body8.us.us ], [ %k.044.us.us.ph, %for.body8.us.us.preheader ]
+  %sum.043.us.us = phi i32 [ %add14.us.us, %for.body8.us.us ], [ %sum.043.us.us.ph, %for.body8.us.us.preheader ]
+  %add.us.us = add i32 %k.044.us.us, %mul.us
+  %arrayidx.us.us = getelementptr inbounds i16, i16* %A, i32 %add.us.us
+  %14 = load i16, i16* %arrayidx.us.us, align 2, !tbaa !3
+  %conv.us.us = sext i16 %14 to i32
+  %mul9.us.us = mul i32 %k.044.us.us, %m
+  %add10.us.us = add i32 %mul9.us.us, %j.046.us.us
+  %arrayidx11.us.us = getelementptr inbounds i16, i16* %B, i32 %add10.us.us
+  %15 = load i16, i16* %arrayidx11.us.us, align 2, !tbaa !3
+  %conv12.us.us = sext i16 %15 to i32
+  %mul13.us.us = mul nsw i32 %conv12.us.us, %conv.us.us
+  %add14.us.us = add nsw i32 %mul13.us.us, %sum.043.us.us
+  %inc.us.us = add nuw nsw i32 %k.044.us.us, 1
+  %exitcond = icmp eq i32 %inc.us.us, %l
+  br i1 %exitcond, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us, !llvm.loop !9
+
+for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare void @llvm.memset.p0i8.i32(i8* align 2, i8, i32, i1)
+
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)