[llvm] r356132 - [ARM][ParallelDSP] Enable multiple uses of loads

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 14 04:14:13 PDT 2019


Author: sam_parker
Date: Thu Mar 14 04:14:13 2019
New Revision: 356132

URL: http://llvm.org/viewvc/llvm-project?rev=356132&view=rev
Log:
[ARM][ParallelDSP] Enable multiple uses of loads
    
When choosing whether a pair of loads can be combined into a single
wide load, we check that the load only has a sext user and that sext
also only has one user. But this can prevent the transformation in
cases where parallel MACs use the same loaded data multiple times.
    
To enable this, we need to fix up any other uses after creating the
wide load: generating a trunc and a shift + trunc pair to recreate
the narrow values. We also need to keep a record of which loads have
already been widened.

Differential Revision: https://reviews.llvm.org/D59215

Added:
    llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
    llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
    llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll

Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=356132&r1=356131&r2=356132&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Thu Mar 14 04:14:13 2019
@@ -53,7 +53,7 @@ namespace {
   using OpChainList     = SmallVector<std::unique_ptr<OpChain>, 8>;
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
-  using MemInstList     = SmallVector<Instruction*, 8>;
+  using MemInstList     = SmallVector<LoadInst*, 8>;
   using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*,16>;
@@ -113,6 +113,21 @@ namespace {
     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
 
+  class WidenedLoad {
+    LoadInst *NewLd = nullptr;
+    SmallVector<LoadInst*, 4> Loads;
+
+  public:
+    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
+      : NewLd(Wide) {
+      for (auto *I : Lds)
+        Loads.push_back(I);
+    }
+    LoadInst *getLoad() {
+      return NewLd;
+    }
+  };
+
   class ARMParallelDSP : public LoopPass {
     ScalarEvolution   *SE;
     AliasAnalysis     *AA;
@@ -123,13 +138,17 @@ namespace {
     const DataLayout  *DL;
     Module            *M;
     std::map<LoadInst*, LoadInst*> LoadPairs;
-    std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
+    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
 
-    bool RecordSequentialLoads(BasicBlock *Header);
+    bool RecordSequentialLoads(BasicBlock *BB);
     bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+    LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
+                            SmallVectorImpl<LoadInst*> &Loads,
+                            IntegerType *LoadTy);
     void CreateParallelMACPairs(Reduction &R);
-    Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+    Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+                                 SmallVectorImpl<LoadInst*> &VecLd1,
                                  Instruction *Acc, bool Exchange,
                                  Instruction *InsertAfter);
 
@@ -202,7 +221,6 @@ namespace {
       }
 
       LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
-      bool Changes = false;
 
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
@@ -212,7 +230,7 @@ namespace {
         return false;
       }
 
-      Changes = MatchSMLAD(F);
+      bool Changes = MatchSMLAD(F);
       return Changes;
     }
   };
@@ -225,7 +243,6 @@ namespace {
 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
 template<unsigned MaxBitWidth>
 static bool IsNarrowSequence(Value *V, ValueList &VL) {
-  LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
   ConstantInt *CInt;
 
   if (match(V, m_ConstantInt(CInt))) {
@@ -244,38 +261,25 @@ static bool IsNarrowSequence(Value *V, V
   } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
     // TODO: we need to implement sadd16/sadd8 for this, which enables to
     // also do the rewrite for smlad8.ll, but it is unsupported for now.
-    LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
     return false;
   } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
-    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
-      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
-        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
       return false;
-    }
 
     if (match(Val, m_Load(m_Value()))) {
-      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
       VL.push_back(Val);
       VL.push_back(I);
       return true;
     }
   }
-  LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
   return false;
 }
 
 template<typename MemInst>
 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
                                   const DataLayout &DL, ScalarEvolution &SE) {
-  if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
-    LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
-    return false;
-  }
-  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
-    LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
+  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
     return true;
-  }
-  LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
   return false;
 }
 
@@ -284,19 +288,14 @@ bool ARMParallelDSP::AreSequentialLoads(
   if (!Ld0 || !Ld1)
     return false;
 
-  LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
+  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
     dbgs() << "Ld0:"; Ld0->dump();
     dbgs() << "Ld1:"; Ld1->dump();
   );
 
-  if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
-    LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
-    return false;
-  }
-
-  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
-    return false;
-
   VecMem.clear();
   VecMem.push_back(Ld0);
   VecMem.push_back(Ld1);
@@ -305,17 +304,16 @@ bool ARMParallelDSP::AreSequentialLoads(
 
 /// Iterate through the block and record base, offset pairs of loads as well as
 /// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
-  for (auto &I : *Header) {
+  for (auto &I : *BB) {
     auto *Ld = dyn_cast<LoadInst>(&I);
-    if (!Ld)
+    if (!Ld || !Ld->isSimple() ||
+        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
       continue;
     Loads.push_back(Ld);
   }
 
-  std::map<LoadInst*, LoadInst*> BaseLoads;
-
   for (auto *Ld0 : Loads) {
     for (auto *Ld1 : Loads) {
       if (Ld0 == Ld1)
@@ -323,17 +321,18 @@ bool ARMParallelDSP::RecordSequentialLoa
 
       if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
         LoadPairs[Ld0] = Ld1;
-        if (BaseLoads.count(Ld0)) {
-          LoadInst *Base = BaseLoads[Ld0];
-          BaseLoads[Ld1] = Base;
-          SequentialLoads[Base].push_back(Ld1);
-        } else {
-          BaseLoads[Ld1] = Ld0;
-          SequentialLoads[Ld0].push_back(Ld1);
-        }
+        break;
       }
     }
   }
+
+  LLVM_DEBUG(if (!LoadPairs.empty()) {
+               dbgs() << "Consecutive load pairs:\n";
+               for (auto &MapIt : LoadPairs) {
+                 LLVM_DEBUG(dbgs() << *MapIt.first << ", "
+                            << *MapIt.second << "\n");
+               }
+             });
   return LoadPairs.size() > 1;
 }
 
@@ -362,12 +361,11 @@ void ARMParallelDSP::CreateParallelMACPa
       if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
         return false;
 
-      LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
-                 << "\t Ld0: " << *Ld0 << "\n"
-                 << "\t Ld1: " << *Ld1 << "\n"
-                 << "and operands " << x + 2 << ":\n"
-                 << "\t Ld2: " << *Ld2 << "\n"
-                 << "\t Ld3: " << *Ld3 << "\n");
+      LLVM_DEBUG(dbgs() << "Loads:\n"
+                 << " - " << *Ld0 << "\n"
+                 << " - " << *Ld1 << "\n"
+                 << " - " << *Ld2 << "\n"
+                 << " - " << *Ld3 << "\n");
 
       if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
         if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
@@ -416,11 +414,6 @@ void ARMParallelDSP::CreateParallelMACPa
 
       assert(PMul0 != PMul1 && "expected different chains");
 
-      LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
-                 dbgs() << "- "; Mul0->dump();
-                 dbgs() << "- "; Mul1->dump());
-
-      LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
       if (CanPair(PMul0, PMul1)) {
         Paired.insert(Mul0);
         Paired.insert(Mul1);
@@ -441,9 +434,8 @@ bool ARMParallelDSP::InsertParallelMACs(
                dbgs() << "- "; PMul0->Root->dump();
                dbgs() << "- "; PMul1->Root->dump());
 
-    auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
-    auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
-    Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
+    Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
+                          InsertAfter);
     InsertAfter = Acc;
   }
 
@@ -499,14 +491,12 @@ static void MatchReductions(Function &F,
 static void AddMACCandidate(OpChainList &Candidates,
                             Instruction *Mul,
                             Value *MulOp0, Value *MulOp1) {
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
   assert(Mul->getOpcode() == Instruction::Mul &&
          "expected mul instruction");
   ValueList LHS;
   ValueList RHS;
   if (IsNarrowSequence<16>(MulOp0, LHS) &&
       IsNarrowSequence<16>(MulOp1, RHS)) {
-    LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
     Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
   }
 }
@@ -514,7 +504,7 @@ static void AddMACCandidate(OpChainList
 static void MatchParallelMACSequences(Reduction &R,
                                       OpChainList &Candidates) {
   Instruction *Acc = R.AccIntAdd;
-  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
+  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc << "\n");
 
   // Returns false to signal the search should be stopped.
   std::function<bool(Value*)> Match =
@@ -687,32 +677,81 @@ bool ARMParallelDSP::MatchSMLAD(Function
   return Changed;
 }
 
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
-                               Type *LoadTy) {
-  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
-
-  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
+LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
+                                        SmallVectorImpl<LoadInst*> &Loads,
+                                        IntegerType *LoadTy) {
+  assert(Loads.size() == 2 && "currently only support widening two loads");
+ 
+  const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
+  Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
                                     LoadTy->getPointerTo(AddrSpace));
-  return IRB.CreateAlignedLoad(LoadTy, VecPtr, BaseLoad.getAlignment());
+  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
+                                             Loads[0]->getAlignment());
+  // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+  Instruction *SExt0 = dyn_cast<SExtInst>(Loads[0]->user_back());
+  Instruction *SExt1 = dyn_cast<SExtInst>(Loads[1]->user_back());
+
+  assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
+         "Loads should have a single, extending, user");
+
+  std::function<void(Instruction*, Instruction*)> MoveAfter =
+    [&](Instruction* Source, Instruction* Sink) -> void {
+    if (DT->dominates(Source, Sink) ||
+        Source->getParent() != Sink->getParent() ||
+        isa<PHINode>(Source) || isa<PHINode>(Sink))
+      return;
+
+    Sink->moveAfter(Source);
+    for (auto &U : Sink->uses())
+      MoveAfter(Sink, cast<Instruction>(U.getUser()));
+  };
+
+  // From the wide load, create two values that equal the original two loads.
+  Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
+  SExt0->setOperand(0, Bottom);
+  if (auto *I = dyn_cast<Instruction>(Bottom)) {
+    I->moveAfter(WideLoad);
+    MoveAfter(I, SExt0);
+  }
+
+  IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
+  Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
+  if (auto *I = dyn_cast<Instruction>(Top))
+    MoveAfter(WideLoad, I);
+
+  Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
+  SExt1->setOperand(0, Trunc);
+  if (auto *I = dyn_cast<Instruction>(Trunc))
+    MoveAfter(I, SExt1);
+
+  WideLoads.emplace(std::make_pair(Loads[0],
+                                   make_unique<WidenedLoad>(Loads, WideLoad)));
+  return WideLoad;
 }
 
-Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+                                             SmallVectorImpl<LoadInst*> &VecLd1,
                                              Instruction *Acc, bool Exchange,
                                              Instruction *InsertAfter) {
   LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
-             << "- " << *VecLd0 << "\n"
-             << "- " << *VecLd1 << "\n"
+             << "- " << *VecLd0[0] << "\n"
+             << "- " << *VecLd0[1] << "\n"
+             << "- " << *VecLd1[0] << "\n"
+             << "- " << *VecLd1[1] << "\n"
              << "- " << *Acc << "\n"
-             << "Exchange: " << Exchange << "\n");
+             << "- Exchange: " << Exchange << "\n");
 
   IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                               ++BasicBlock::iterator(InsertAfter));
 
   // Replace the reduction chain with an intrinsic call
-  Type *Ty = IntegerType::get(M->getContext(), 32);
-  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
-  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
-  Value* Args[] = { NewLd0, NewLd1, Acc };
+  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
+  LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
+    WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+  LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
+    WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+  Value* Args[] = { WideLd0, WideLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
     SMLAD = Acc->getType()->isIntegerTy(32) ?
@@ -740,7 +779,6 @@ bool BinOpChain::AreSymmetrical(BinOpCha
     }
 
     const unsigned Pairs = VL0.size();
-    LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
 
     for (unsigned i = 0; i < Pairs; ++i) {
       const Value *V0 = VL0[i];
@@ -748,24 +786,17 @@ bool BinOpChain::AreSymmetrical(BinOpCha
       const auto *Inst0 = dyn_cast<Instruction>(V0);
       const auto *Inst1 = dyn_cast<Instruction>(V1);
 
-      LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
-                dbgs() << "mul1: "; V0->dump();
-                dbgs() << "mul2: "; V1->dump());
-
       if (!Inst0 || !Inst1)
         return false;
 
-      if (Inst0->isSameOperationAs(Inst1)) {
-        LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+      if (Inst0->isSameOperationAs(Inst1))
         continue;
-      }
 
       const APInt *C0, *C1;
       if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
         return false;
     }
 
-    LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
     return true;
   };
 

Added: llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll?rev=356132&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll Thu Mar 14 04:14:13 2019
@@ -0,0 +1,251 @@
+; RUN: llc -O3 -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s | FileCheck %s
+
+; CHECK-LABEL: add_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]]
+define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %count.next = add i32 %conv4, %count
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_bottom_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: mul [[COUNT:r[0-9]+]], [[SXT]], [[COUNT]]
+define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %conv4, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_top_user
+; CHECK: %for.body
+; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: asr.w [[ASR:[rl0-9]+]], [[ASR]], #16
+; CHECK: mul [[COUNT:[rl0-9]+]], [[ASR]], [[COUNT]]
+define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %conv7, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: and_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: uxth [[UXT:r[0-9]+]], [[A]]
+; CHECK: mul [[MUL:r[0-9]+]], [[UXT]], [[MUL]]
+define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %bottom = and i32 %conv4, 65535
+  %mul = mul nsw i32 %conv, %conv4
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %bottom, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: multi_uses
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]]
+; CHECK: mul [[MUL:r[0-9]+]], [[EOR]], [[SXT]]
+; CHECK: lsl.w [[SHIFT]], [[MUL]], #16
+define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %bottom = and i32 %conv4, 65535
+  %mul = mul nsw i32 %conv, %conv4
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %shl = shl i32 %conv4, 16
+  %add11 = add i32 %mul9, %add10
+  %xor = xor i32 %bottom, %count
+  %count.next = mul i32 %xor, %shl
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}

Modified: llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll?rev=356132&r1=356131&r2=356132&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll Thu Mar 14 04:14:13 2019
@@ -210,3 +210,4 @@ for.body:
   %exitcond = icmp ne i32 %add, %arg
   br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
+

Added: llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll?rev=356132&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll Thu Mar 14 04:14:13 2019
@@ -0,0 +1,217 @@
+; RUN: llc -O3 -mtriple=thumbv7em %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s
+
+; Test that the duplicate loads are removed, which allows the ParallelDSP pass
+; to find the parallel operations.
+
+; Function under test: a 4x4 unroll-and-jam of a dot-product kernel. Four
+; accumulators (%A3, %A4, %A5, %A6) each sum products of sign-extended i16
+; loads from %A and %B. Many of the %B loads share the same index value
+; (e.g. %A8, %B0, %B2 and %B4 all load B[%j.026.us.i.i], since the four
+; `or %j.026.us.i.i, 1` increments are identical), so after the duplicate
+; loads are CSE'd each remaining load feeds several multiplies -- the
+; multi-use case this patch enables. The CHECK lines expect eight smlads.
+define void @unroll_n_jam_smlad(i32* %res, i16* %A, i16* %B, i32 %N, i32 %idx) {
+entry:
+  %xtraiter306.i = and i32 %N, 3
+  %unroll_iter310.i = sub i32 %N, %xtraiter306.i
+  %arrayidx.us.i117.i = getelementptr inbounds i32, i32* %res, i32 %idx
+  store i32 0, i32* %arrayidx.us.i117.i, align 4
+  %mul.us.i118.i = mul i32 %idx, %N
+  %inc11.us.i.i = or i32 %idx, 1
+  %arrayidx.us.i117.1.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.i
+  store i32 0, i32* %arrayidx.us.i117.1.i, align 4
+  %mul.us.i118.1.i = mul i32 %inc11.us.i.i, %N
+  %inc11.us.i.1.i = or i32 %idx, 2
+  %arrayidx.us.i117.2.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.1.i
+  store i32 0, i32* %arrayidx.us.i117.2.i, align 4
+  %mul.us.i118.2.i = mul i32 %inc11.us.i.1.i, %N
+  %inc11.us.i.2.i = or i32 %idx, 3
+  %arrayidx.us.i117.3.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.2.i
+  store i32 0, i32* %arrayidx.us.i117.3.i, align 4
+  %mul.us.i118.3.i = mul i32 %inc11.us.i.2.i, %N
+  %inc11.us.i.3.i = add i32 %idx, 4
+  br label %for.body
+
; CHECK: %for.body
; CHECK: smlad
; CHECK: smlad
; CHECK: smlad
; CHECK: smlad
; CHECK: smlad
; CHECK: smlad
; CHECK: smlad
; CHECK: smlad
+
+for.body:
+  %A3 = phi i32 [ %add9.us.i.3361.i, %for.body ], [ 0, %entry ]
+  %j.026.us.i.i = phi i32 [ %inc.us.i.3362.i, %for.body ], [ 0, %entry ]
+  %A4 = phi i32 [ %add9.us.i.1.3.i, %for.body ], [ 0, %entry ]
+  %A5 = phi i32 [ %add9.us.i.2.3.i, %for.body ], [ 0, %entry ]
+  %A6 = phi i32 [ %add9.us.i.3.3.i, %for.body ], [ 0, %entry ]
+  %niter335.i = phi i32 [ %niter335.nsub.3.i, %for.body ], [ %unroll_iter310.i, %entry ]
+  %add.us.i.i = add i32 %j.026.us.i.i, %mul.us.i118.i
+  %arrayidx4.us.i.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.i
+  %A7 = load i16, i16* %arrayidx4.us.i.i, align 2
+  %conv.us.i.i = sext i16 %A7 to i32
+  %arrayidx5.us.i.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %A8 = load i16, i16* %arrayidx5.us.i.i, align 2
+  %conv6.us.i.i = sext i16 %A8 to i32
+  %mul7.us.i.i = mul nsw i32 %conv6.us.i.i, %conv.us.i.i
+  %add9.us.i.i = add nsw i32 %mul7.us.i.i, %A3
+  %inc.us.i.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.1.i = add i32 %j.026.us.i.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.i
+  %A9 = load i16, i16* %arrayidx4.us.i.1.i, align 2
+  %conv.us.i.1.i = sext i16 %A9 to i32
+  %arrayidx5.us.i.1.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %B0 = load i16, i16* %arrayidx5.us.i.1.i, align 2
+  %conv6.us.i.1.i = sext i16 %B0 to i32
+  %mul7.us.i.1.i = mul nsw i32 %conv6.us.i.1.i, %conv.us.i.1.i
+  %add9.us.i.1.i = add nsw i32 %mul7.us.i.1.i, %A4
+  %inc.us.i.1.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.2.i = add i32 %j.026.us.i.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.i
+  %B1 = load i16, i16* %arrayidx4.us.i.2.i, align 2
+  %conv.us.i.2.i = sext i16 %B1 to i32
+  %arrayidx5.us.i.2.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %B2 = load i16, i16* %arrayidx5.us.i.2.i, align 2
+  %conv6.us.i.2.i = sext i16 %B2 to i32
+  %mul7.us.i.2.i = mul nsw i32 %conv6.us.i.2.i, %conv.us.i.2.i
+  %add9.us.i.2.i = add nsw i32 %mul7.us.i.2.i, %A5
+  %inc.us.i.2.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.3.i = add i32 %j.026.us.i.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.i
+  %B3 = load i16, i16* %arrayidx4.us.i.3.i, align 2
+  %conv.us.i.3.i = sext i16 %B3 to i32
+  %arrayidx5.us.i.3.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %B4 = load i16, i16* %arrayidx5.us.i.3.i, align 2
+  %conv6.us.i.3.i = sext i16 %B4 to i32
+  %mul7.us.i.3.i = mul nsw i32 %conv6.us.i.3.i, %conv.us.i.3.i
+  %add9.us.i.3.i = add nsw i32 %mul7.us.i.3.i, %A6
+  %inc.us.i.3.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.1337.i = add i32 %inc.us.i.i, %mul.us.i118.i
+  %arrayidx4.us.i.1338.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1337.i
+  %B5 = load i16, i16* %arrayidx4.us.i.1338.i, align 2
+  %conv.us.i.1339.i = sext i16 %B5 to i32
+  %arrayidx5.us.i.1340.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.i
+  %B6 = load i16, i16* %arrayidx5.us.i.1340.i, align 2
+  %conv6.us.i.1341.i = sext i16 %B6 to i32
+  %mul7.us.i.1342.i = mul nsw i32 %conv6.us.i.1341.i, %conv.us.i.1339.i
+  %add9.us.i.1343.i = add nsw i32 %mul7.us.i.1342.i, %add9.us.i.i
+  %inc.us.i.1344.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.1.1.i = add i32 %inc.us.i.1.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.1.i
+  %B7 = load i16, i16* %arrayidx4.us.i.1.1.i, align 2
+  %conv.us.i.1.1.i = sext i16 %B7 to i32
+  %arrayidx5.us.i.1.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.i
+  %B6.dup = load i16, i16* %arrayidx5.us.i.1.1.i, align 2
+  %conv6.us.i.1.1.i = sext i16 %B6.dup to i32
+  %mul7.us.i.1.1.i = mul nsw i32 %conv6.us.i.1.1.i, %conv.us.i.1.1.i
+  %add9.us.i.1.1.i = add nsw i32 %mul7.us.i.1.1.i, %add9.us.i.1.i
+  %inc.us.i.1.1.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.2.1.i = add i32 %inc.us.i.2.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.1.i
+  %B9 = load i16, i16* %arrayidx4.us.i.2.1.i, align 2
+  %conv.us.i.2.1.i = sext i16 %B9 to i32
+  %arrayidx5.us.i.2.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.i
+  %B6.dup.i = load i16, i16* %arrayidx5.us.i.2.1.i, align 2
+  %conv6.us.i.2.1.i = sext i16 %B6.dup.i to i32
+  %mul7.us.i.2.1.i = mul nsw i32 %conv6.us.i.2.1.i, %conv.us.i.2.1.i
+  %add9.us.i.2.1.i = add nsw i32 %mul7.us.i.2.1.i, %add9.us.i.2.i
+  %inc.us.i.2.1.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.3.1.i = add i32 %inc.us.i.3.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.1.i
+  %B11 = load i16, i16* %arrayidx4.us.i.3.1.i, align 2
+  %conv.us.i.3.1.i = sext i16 %B11 to i32
+  %arrayidx5.us.i.3.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.i
+  %B6.dup.i.i = load i16, i16* %arrayidx5.us.i.3.1.i, align 2
+  %conv6.us.i.3.1.i = sext i16 %B6.dup.i.i to i32
+  %mul7.us.i.3.1.i = mul nsw i32 %conv6.us.i.3.1.i, %conv.us.i.3.1.i
+  %add9.us.i.3.1.i = add nsw i32 %mul7.us.i.3.1.i, %add9.us.i.3.i
+  %inc.us.i.3.1.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.2346.i = add i32 %inc.us.i.1344.i, %mul.us.i118.i
+  %arrayidx4.us.i.2347.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2346.i
+  %B13 = load i16, i16* %arrayidx4.us.i.2347.i, align 2
+  %conv.us.i.2348.i = sext i16 %B13 to i32
+  %arrayidx5.us.i.2349.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1344.i
+  %B14 = load i16, i16* %arrayidx5.us.i.2349.i, align 2
+  %conv6.us.i.2350.i = sext i16 %B14 to i32
+  %mul7.us.i.2351.i = mul nsw i32 %conv6.us.i.2350.i, %conv.us.i.2348.i
+  %add9.us.i.2352.i = add nsw i32 %mul7.us.i.2351.i, %add9.us.i.1343.i
+  %inc.us.i.2353.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.1.2.i = add i32 %inc.us.i.1.1.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.2.i
+  %B15 = load i16, i16* %arrayidx4.us.i.1.2.i, align 2
+  %conv.us.i.1.2.i = sext i16 %B15 to i32
+  %arrayidx5.us.i.1.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.1.i
+  %B14.dup = load i16, i16* %arrayidx5.us.i.1.2.i, align 2
+  %conv6.us.i.1.2.i = sext i16 %B14.dup to i32
+  %mul7.us.i.1.2.i = mul nsw i32 %conv6.us.i.1.2.i, %conv.us.i.1.2.i
+  %add9.us.i.1.2.i = add nsw i32 %mul7.us.i.1.2.i, %add9.us.i.1.1.i
+  %inc.us.i.1.2.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.2.2.i = add i32 %inc.us.i.2.1.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.2.i
+  %B17 = load i16, i16* %arrayidx4.us.i.2.2.i, align 2
+  %conv.us.i.2.2.i = sext i16 %B17 to i32
+  %arrayidx5.us.i.2.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.1.i
+  %B14.dup.i = load i16, i16* %arrayidx5.us.i.2.2.i, align 2
+  %conv6.us.i.2.2.i = sext i16 %B14.dup.i to i32
+  %mul7.us.i.2.2.i = mul nsw i32 %conv6.us.i.2.2.i, %conv.us.i.2.2.i
+  %add9.us.i.2.2.i = add nsw i32 %mul7.us.i.2.2.i, %add9.us.i.2.1.i
+  %inc.us.i.2.2.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.3.2.i = add i32 %inc.us.i.3.1.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.2.i
+  %B19 = load i16, i16* %arrayidx4.us.i.3.2.i, align 2
+  %conv.us.i.3.2.i = sext i16 %B19 to i32
+  %arrayidx5.us.i.3.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.1.i
+  %B14.dup.i.i = load i16, i16* %arrayidx5.us.i.3.2.i, align 2
+  %conv6.us.i.3.2.i = sext i16 %B14.dup.i.i to i32
+  %mul7.us.i.3.2.i = mul nsw i32 %conv6.us.i.3.2.i, %conv.us.i.3.2.i
+  %add9.us.i.3.2.i = add nsw i32 %mul7.us.i.3.2.i, %add9.us.i.3.1.i
+  %inc.us.i.3.2.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.3355.i = add i32 %inc.us.i.2353.i, %mul.us.i118.i
+  %arrayidx4.us.i.3356.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3355.i
+  %B21 = load i16, i16* %arrayidx4.us.i.3356.i, align 2
+  %conv.us.i.3357.i = sext i16 %B21 to i32
+  %arrayidx5.us.i.3358.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2353.i
+  %B22 = load i16, i16* %arrayidx5.us.i.3358.i, align 2
+  %conv6.us.i.3359.i = sext i16 %B22 to i32
+  %mul7.us.i.3360.i = mul nsw i32 %conv6.us.i.3359.i, %conv.us.i.3357.i
+  %add9.us.i.3361.i = add nsw i32 %mul7.us.i.3360.i, %add9.us.i.2352.i
+  %inc.us.i.3362.i = add i32 %j.026.us.i.i, 4
+  %add.us.i.1.3.i = add i32 %inc.us.i.1.2.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.3.i
+  %B23 = load i16, i16* %arrayidx4.us.i.1.3.i, align 2
+  %conv.us.i.1.3.i = sext i16 %B23 to i32
+  %arrayidx5.us.i.1.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.2.i
+  %B22.dup = load i16, i16* %arrayidx5.us.i.1.3.i, align 2
+  %conv6.us.i.1.3.i = sext i16 %B22.dup to i32
+  %mul7.us.i.1.3.i = mul nsw i32 %conv6.us.i.1.3.i, %conv.us.i.1.3.i
+  %add9.us.i.1.3.i = add nsw i32 %mul7.us.i.1.3.i, %add9.us.i.1.2.i
+  %add.us.i.2.3.i = add i32 %inc.us.i.2.2.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.3.i
+  %B25 = load i16, i16* %arrayidx4.us.i.2.3.i, align 2
+  %conv.us.i.2.3.i = sext i16 %B25 to i32
+  %arrayidx5.us.i.2.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.2.i
+  %B22.dup.i = load i16, i16* %arrayidx5.us.i.2.3.i, align 2
+  %conv6.us.i.2.3.i = sext i16 %B22.dup.i to i32
+  %mul7.us.i.2.3.i = mul nsw i32 %conv6.us.i.2.3.i, %conv.us.i.2.3.i
+  %add9.us.i.2.3.i = add nsw i32 %mul7.us.i.2.3.i, %add9.us.i.2.2.i
+  %add.us.i.3.3.i = add i32 %inc.us.i.3.2.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.3.i
+  %B27 = load i16, i16* %arrayidx4.us.i.3.3.i, align 2
+  %conv.us.i.3.3.i = sext i16 %B27 to i32
+  %arrayidx5.us.i.3.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.2.i
+  %B22.dup.i.i = load i16, i16* %arrayidx5.us.i.3.3.i, align 2
+  %conv6.us.i.3.3.i = sext i16 %B22.dup.i.i to i32
+  %mul7.us.i.3.3.i = mul nsw i32 %conv6.us.i.3.3.i, %conv.us.i.3.3.i
+  %add9.us.i.3.3.i = add nsw i32 %mul7.us.i.3.3.i, %add9.us.i.3.2.i
+  %niter335.nsub.3.i = add i32 %niter335.i, -4
+  %niter335.ncmp.3.i = icmp eq i32 %niter335.nsub.3.i, 0
+  br i1 %niter335.ncmp.3.i, label %exit, label %for.body
+
+; Write the four final accumulator values out to res[0..3].
+exit:
+  %arrayidx.out.i = getelementptr inbounds i32, i32* %res, i32 0
+  store i32 %add9.us.i.3361.i, i32* %arrayidx.out.i, align 4
+  %arrayidx.out.1.i = getelementptr inbounds i32, i32* %res, i32 1
+  store i32 %add9.us.i.1.3.i, i32* %arrayidx.out.1.i, align 4
+  %arrayidx.out.2.i = getelementptr inbounds i32, i32* %res, i32 2
+  store i32 %add9.us.i.2.3.i, i32* %arrayidx.out.2.i, align 4
+  %arrayidx.out.3.i = getelementptr inbounds i32, i32* %res, i32 3
+  store i32 %add9.us.i.3.3.i, i32* %arrayidx.out.3.i, align 4
+  ret void
+}




More information about the llvm-commits mailing list