[llvm] r342210 - [ARM] bottom-top mul support in ARMParallelDSP

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 17 00:31:43 PDT 2018


Thanks Reid, I'll look into it.


Sam Parker

Compilation Tools Engineer | Arm

. . . . . . . . . . . . . . . . . . . . . . . . . . .

Arm.com

________________________________
From: Reid Kleckner <rnk at google.com>
Sent: 14 September 2018 20:31:01
To: Sam Parker
Cc: llvm-commits
Subject: Re: [llvm] r342210 - [ARM] bottom-top mul support in ARMParallelDSP


$ cat convolver.cpp
int a;
char *b;
char c;
short *d;
void h() {
  for (;;) {
    int e, f, g;
    for (; g < 1; ++g, f += a)
      e += d[g] * b[f];
    c = e;
  }
}

$ ./bin/clang++ -cc1 -triple thumbv7-unknown-linux-android -emit-obj   -O2  -vectorize-loops -vectorize-slp -x c++   convolver.cpp
clang++: /usr/local/google/home/rnk/llvm-project/llvm/include/llvm/Support/Casting.h:255: typename cast_retty<X, Y *>::ret_type llvm::cast(Y *) [X = llvm::IntegerType, Y = const llvm::Type]: Assertion `isa<X>(Val) && "cast<Ty>() argument of incompatible type!"' failed.

On Fri, Sep 14, 2018 at 11:46 AM Reid Kleckner <rnk at google.com<mailto:rnk at google.com>> wrote:
This caused assertion failures while building Chromium for Android, so I have reverted it and am starting a reduction:
https://ci.chromium.org/buildbot/chromium.clang/ToTAndroid/4550

On Fri, Sep 14, 2018 at 1:10 AM Sam Parker via llvm-commits <llvm-commits at lists.llvm.org<mailto:llvm-commits at lists.llvm.org>> wrote:
Author: sam_parker
Date: Fri Sep 14 01:09:09 2018
New Revision: 342210

URL: http://llvm.org/viewvc/llvm-project?rev=342210&view=rev
Log:
[ARM] bottom-top mul support in ARMParallelDSP

On failing to find sequences that can be converted into dual macs,
try to find sequential 16-bit loads that are used by muls, for which
we can then use smulbb, smulbt, smultb and smultt with a wide load.

Differential Revision: https://reviews.llvm.org/D51983

Added:
    llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll
    llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp

Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=342210&r1=342209&r2=342210&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Fri Sep 14 01:09:09 2018
@@ -55,6 +55,7 @@ namespace {
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
   using MemInstList     = SmallVector<Instruction*, 8>;
+  using LoadInstList    = SmallVector<LoadInst*, 8>;
   using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*,16>;
@@ -63,7 +64,8 @@ namespace {
   struct OpChain {
     Instruction   *Root;
     ValueList     AllValues;
-    MemInstList   VecLd;    // List of all load instructions.
+    MemInstList   VecLd;    // List of all sequential load instructions.
+    LoadInstList  Loads;    // List of all load instructions.
     MemLocList    MemLocs;  // All memory locations read by this tree.
     bool          ReadOnly = true;

@@ -76,8 +78,10 @@ namespace {
         if (auto *I = dyn_cast<Instruction>(V)) {
           if (I->mayWriteToMemory())
             ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
+          if (auto *Ld = dyn_cast<LoadInst>(V)) {
             MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+            Loads.push_back(Ld);
+          }
         }
       }
     }
@@ -135,6 +139,7 @@ namespace {
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
     bool MatchSMLAD(Function &F);
+    bool MatchTopBottomMuls(BasicBlock *LoopBody);

   public:
     static char ID;
@@ -203,6 +208,8 @@ namespace {
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
       Changes = MatchSMLAD(F);
+      if (!Changes)
+        Changes = MatchTopBottomMuls(Header);
       return Changes;
     }
   };
@@ -496,10 +503,10 @@ static void MatchReductions(Function &F,
   );
 }

-static void AddMACCandidate(OpChainList &Candidates,
+static void AddMulCandidate(OpChainList &Candidates,
                             Instruction *Mul,
                             Value *MulOp0, Value *MulOp1) {
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+  LLVM_DEBUG(dbgs() << "OK, found mul:\t"; Mul->dump());
   assert(Mul->getOpcode() == Instruction::Mul &&
          "expected mul instruction");
   ValueList LHS;
@@ -533,14 +540,14 @@ static void MatchParallelMACSequences(Re
       break;
     case Instruction::Mul:
       if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
-        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+        AddMulCandidate(Candidates, I, MulOp0, MulOp1);
         return false;
       }
       break;
     case Instruction::SExt:
       if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
         Instruction *Mul = cast<Instruction>(I->getOperand(0));
-        AddMACCandidate(Candidates, Mul, MulOp0, MulOp1);
+        AddMulCandidate(Candidates, Mul, MulOp0, MulOp1);
         return false;
       }
       break;
@@ -569,23 +576,24 @@ static void AliasCandidates(BasicBlock *
 // the memory locations accessed by the MAC-chains.
 // TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
+                       Instructions &Writes, OpChainList &Candidates) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
+  for (auto &Candidate : Candidates) {
+    LLVM_DEBUG(dbgs() << "mul: "; Candidate->Root->dump());
+    Candidate->SetMemoryLocations();

     // At the moment, we allow only simple chains that only consist of reads,
     // accumulate their result with an integer add, and thus that don't write
     // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
+    if (!Candidate->ReadOnly)
       return true;

     // Now for all writes in the basic block, check that they don't alias with
     // the memory locations accessed by our MAC-chain:
     for (auto *I : Writes) {
       LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
+      assert(Candidate->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+      for (auto &MemLoc : Candidate->MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
@@ -599,7 +607,7 @@ static bool AreAliased(AliasAnalysis *AA
   return false;
 }

-static bool CheckMACMemory(OpChainList &Candidates) {
+static bool CheckMulMemory(OpChainList &Candidates) {
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
@@ -607,7 +615,6 @@ static bool CheckMACMemory(OpChainList &
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
-    C->SetMemoryLocations();
     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;

@@ -620,6 +627,131 @@ static bool CheckMACMemory(OpChainList &
   return true;
 }

+static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst *BaseLoad,
+                               const Type *LoadTy) {
+  const unsigned AddrSpace = BaseLoad->getPointerAddressSpace();
+
+  Value *VecPtr = IRB.CreateBitCast(BaseLoad->getPointerOperand(),
+                                     LoadTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedLoad(VecPtr, BaseLoad->getAlignment());
+}
+
+/// Attempt to widen loads and use smulbb, smulbt, smultb and smultt muls.
+// TODO: This, like smlad generation, expects the leaf operands to be loads
+// that are sign extended. We should be able to handle scalar values as well
+// performing these muls on word x half types to generate smulwb and smulwt.
+bool ARMParallelDSP::MatchTopBottomMuls(BasicBlock *LoopBody) {
+  LLVM_DEBUG(dbgs() << "Attempting to find BT|TB muls.\n");
+
+  OpChainList Candidates;
+  for (auto &I : *LoopBody) {
+    if (I.getOpcode() == Instruction::Mul) {
+      if (I.getType()->getScalarSizeInBits() == 32 ||
+          I.getType()->getScalarSizeInBits() == 64)
+      AddMulCandidate(Candidates, &I, I.getOperand(0), I.getOperand(1));
+    }
+  }
+
+  if (Candidates.empty())
+    return false;
+
+  Instructions Reads;
+  Instructions Writes;
+  AliasCandidates(LoopBody, Reads, Writes);
+
+  if (AreAliased(AA, Reads, Writes, Candidates))
+    return false;
+
+  DenseMap<LoadInst*, Instruction*> LoadUsers;
+  DenseMap<LoadInst*, LoadInst*> SeqLoads;
+  SmallPtrSet<LoadInst*, 8> OffsetLoads;
+
+  for (unsigned i = 0; i < Candidates.size(); ++i) {
+    for (unsigned j = 0; j < Candidates.size(); ++j) {
+      if (i == j)
+        continue;
+
+      OpChain *MulChain0 = Candidates[i].get();
+      OpChain *MulChain1 = Candidates[j].get();
+
+      for (auto *Ld0 : MulChain0->Loads) {
+        if (SeqLoads.count(Ld0) || OffsetLoads.count(Ld0))
+          continue;
+
+        for (auto *Ld1 : MulChain1->Loads) {
+          if (SeqLoads.count(Ld1) || OffsetLoads.count(Ld1))
+            continue;
+
+          MemInstList VecMem;
+          if (AreSequentialLoads(Ld0, Ld1, VecMem)) {
+            SeqLoads[Ld0] = Ld1;
+            OffsetLoads.insert(Ld1);
+            LoadUsers[Ld0] = MulChain0->Root;
+            LoadUsers[Ld1] = MulChain1->Root;
+          }
+        }
+      }
+    }
+  }
+
+  if (SeqLoads.empty())
+    return false;
+
+  IRBuilder<NoFolder> IRB(LoopBody);
+  const Type *Ty = IntegerType::get(M->getContext(), 32);
+
+  // We know that at least one of the operands is a SExt of Ld.
+  auto GetSExt = [](Instruction *I, LoadInst *Ld, unsigned OpIdx) -> Instruction* {
+    if (!isa<Instruction>(I->getOperand(OpIdx)))
+      return nullptr;
+
+    Value *SExt = nullptr;
+    if (cast<Instruction>(I->getOperand(OpIdx))->getOperand(0) == Ld)
+      SExt = I->getOperand(0);
+    else
+      SExt = I->getOperand(1);
+
+    return cast<Instruction>(SExt);
+  };
+
+  LLVM_DEBUG(dbgs() << "Found some sequential loads, now widening:\n");
+  for (auto &Pair : SeqLoads) {
+    LoadInst *BaseLd = Pair.first;
+    LoadInst *OffsetLd = Pair.second;
+    IRB.SetInsertPoint(BaseLd);
+    LoadInst *WideLd = CreateLoadIns(IRB, BaseLd, Ty);
+    LLVM_DEBUG(dbgs() << " - with base load: " << *BaseLd << "\n");
+    LLVM_DEBUG(dbgs() << " - created wide load: " << *WideLd << "\n");
+    Instruction *BaseUser = LoadUsers[BaseLd];
+    Instruction *OffsetUser = LoadUsers[OffsetLd];
+
+    Instruction *BaseSExt = GetSExt(BaseUser, BaseLd, 0);
+    if (!BaseSExt)
+      BaseSExt = GetSExt(BaseUser, BaseLd, 1);
+    Instruction *OffsetSExt = GetSExt(OffsetUser, OffsetLd, 0);
+    if (!OffsetSExt)
+      OffsetSExt = GetSExt(OffsetUser, OffsetLd, 1);
+
+    assert((BaseSExt && OffsetSExt) && "failed to find SExts");
+
+    // BaseUser needs to: (asr (shl WideLoad, 16), 16)
+    // OffsetUser needs to: (asr WideLoad, 16)
+    auto *Shl = cast<Instruction>(IRB.CreateShl(WideLd, 16));
+    auto *Bottom = cast<Instruction>(IRB.CreateAShr(Shl, 16));
+    auto *Top = cast<Instruction>(IRB.CreateAShr(WideLd, 16));
+    BaseUser->replaceUsesOfWith(BaseSExt, Bottom);
+    OffsetUser->replaceUsesOfWith(OffsetSExt, Top);
+
+    BaseSExt->eraseFromParent();
+    OffsetSExt->eraseFromParent();
+    BaseLd->eraseFromParent();
+    OffsetLd->eraseFromParent();
+  }
+  LLVM_DEBUG(dbgs() << "Block after top bottom mul replacements:\n"
+             << *LoopBody << "\n");
+  return true;
+}
+
 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
 // To use SMLAD:
@@ -658,14 +790,15 @@ bool ARMParallelDSP::MatchSMLAD(Function
              dbgs() << "Header block:\n"; Header->dump();
              dbgs() << "Loop info:\n\n"; L->dump());

-  bool Changed = false;
   ReductionList Reductions;
   MatchReductions(F, L, Header, Reductions);
+  if (Reductions.empty())
+    return false;

   for (auto &R : Reductions) {
     OpChainList MACCandidates;
     MatchParallelMACSequences(R, MACCandidates);
-    if (!CheckMACMemory(MACCandidates))
+    if (!CheckMulMemory(MACCandidates))
       continue;

     R.MACCandidates = std::move(MACCandidates);
@@ -682,6 +815,7 @@ bool ARMParallelDSP::MatchSMLAD(Function
   Instructions Reads, Writes;
   AliasCandidates(Header, Reads, Writes);

+  bool Changed = false;
   for (auto &R : Reductions) {
     if (AreAliased(AA, Reads, Writes, R.MACCandidates))
       return false;
@@ -693,15 +827,6 @@ bool ARMParallelDSP::MatchSMLAD(Function
   return Changed;
 }

-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
-                               const Type *LoadTy) {
-  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
-
-  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
-                                    LoadTy->getPointerTo(AddrSpace));
-  return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
-}
-
 Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                              Instruction *Acc, bool Exchange,
                                              Instruction *InsertAfter) {
@@ -716,8 +841,8 @@ Instruction *ARMParallelDSP::CreateSMLAD

   // Replace the reduction chain with an intrinsic call
   const Type *Ty = IntegerType::get(M->getContext(), 32);
-  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
-  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
+  LoadInst *NewLd0 = CreateLoadIns(Builder, &VecLd0[0], Ty);
+  LoadInst *NewLd1 = CreateLoadIns(Builder, &VecLd1[0], Ty);
   Value* Args[] = { NewLd0, NewLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)

Added: llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll?rev=342210&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll Fri Sep 14 01:09:09 2018
@@ -0,0 +1,209 @@
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
+
+; CHECK-LABEL: topbottom_mul_alias
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_alias(i32 %N, i32* nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
+  %In1.0 = load i16, i16* %PIn1.0, align 2
+  %SIn1.0 = sext i16 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
+  %In2.0 = load i16, i16* %PIn2.0, align 2
+  %SIn2.0 = sext i16 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
+  %In1.1 = load i16, i16* %PIn1.1, align 2
+  %SIn1.1 = sext i16 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
+  %In2.1 = load i16, i16* %PIn2.1, align 2
+  %SIn2.1 = sext i16 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
+  %In1.2 = load i16, i16* %PIn1.2, align 2
+  %SIn1.2 = sext i16 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
+  %In2.2 = load i16, i16* %PIn2.2, align 2
+  %SIn2.2 = sext i16 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
+  %In1.3 = load i16, i16* %PIn1.3, align 2
+  %SIn1.3 = sext i16 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
+  %In2.3 = load i16, i16* %PIn2.3, align 2
+  %SIn2.3 = sext i16 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: We should be able to handle this by splatting the const value.
+; CHECK-LABEL: topbottom_mul_const
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16 signext %const) {
+entry:
+  %conv4.i.i = sext i16 %const to i32
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
+  %In.0 = load i16, i16* %PIn.0, align 2
+  %conv.us.i144.i = sext i16 %In.0 to i32
+  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
+  %In.1 = load i16, i16* %PIn.1, align 2
+  %conv.us.i144.1.i = sext i16 %In.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
+  %In.3 = load i16, i16* %PIn.2, align 2
+  %conv.us.i144.2.i = sext i16 %In.3 to i32
+  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
+  %In.4 = load i16, i16* %PIn.3, align 2
+  %conv.us.i144.3.i = sext i16 %In.4 to i32
+  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: We should be able to handle this and use smulwt and smulwb.
+; CHECK-LABEL: topbottom_mul_word_load_const
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_word_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i32* %C) {
+entry:
+  %const = load i32, i32* %C
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
+  %In.0 = load i16, i16* %PIn.0, align 2
+  %conv.us.i144.i = sext i16 %In.0 to i32
+  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %const
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
+  %In.1 = load i16, i16* %PIn.1, align 2
+  %conv.us.i144.1.i = sext i16 %In.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %const
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
+  %In.3 = load i16, i16* %PIn.2, align 2
+  %conv.us.i144.2.i = sext i16 %In.3 to i32
+  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %const
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
+  %In.4 = load i16, i16* %PIn.3, align 2
+  %conv.us.i144.3.i = sext i16 %In.4 to i32
+  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %const
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: topbottom_mul_8
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_8(i32 %N, i32* noalias nocapture readnone %Out, i8* nocapture readonly %In1, i8* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i8, i8* %In1, i32 %iv
+  %In1.0 = load i8, i8* %PIn1.0, align 1
+  %SIn1.0 = sext i8 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i8, i8* %In2, i32 %iv
+  %In2.0 = load i8, i8* %PIn2.0, align 1
+  %SIn2.0 = sext i8 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i8, i8* %In1, i32 %iv.1
+  %In1.1 = load i8, i8* %PIn1.1, align 1
+  %SIn1.1 = sext i8 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i8, i8* %In2, i32 %iv.1
+  %In2.1 = load i8, i8* %PIn2.1, align 1
+  %SIn2.1 = sext i8 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i8, i8* %In1, i32 %iv.2
+  %In1.2 = load i8, i8* %PIn1.2, align 1
+  %SIn1.2 = sext i8 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i8, i8* %In2, i32 %iv.2
+  %In2.2 = load i8, i8* %PIn2.2, align 1
+  %SIn2.2 = sext i8 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i8, i8* %In1, i32 %iv.3
+  %In1.3 = load i8, i8* %PIn1.3, align 1
+  %SIn1.3 = sext i8 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i8, i8* %In2, i32 %iv.3
+  %In2.3 = load i8, i8* %PIn2.3, align 1
+  %SIn2.3 = sext i8 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll?rev=342210&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll Fri Sep 14 01:09:09 2018
@@ -0,0 +1,251 @@
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
+
+; CHECK-LABEL: topbottom_mul
+define void @topbottom_mul(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
+; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
+; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
+; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
+; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
+
+; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
+; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
+; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
+; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
+; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
+
+; CHECK: mul nsw i32 [[PIn1_0]], [[PIn2_0]]
+; CHECK: mul nsw i32 [[PIn1_1]], [[PIn2_1]]
+
+; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
+; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
+; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
+; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
+; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
+
+; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
+; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
+; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
+; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
+; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
+
+; CHECK: mul nsw i32 [[PIn1_2]], [[PIn2_2]]
+; CHECK: mul nsw i32 [[PIn1_3]], [[PIn2_3]]
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
+  %In1.0 = load i16, i16* %PIn1.0, align 2
+  %SIn1.0 = sext i16 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
+  %In2.0 = load i16, i16* %PIn2.0, align 2
+  %SIn2.0 = sext i16 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
+  %In1.1 = load i16, i16* %PIn1.1, align 2
+  %SIn1.1 = sext i16 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
+  %In2.1 = load i16, i16* %PIn2.1, align 2
+  %SIn2.1 = sext i16 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
+  %In1.2 = load i16, i16* %PIn1.2, align 2
+  %SIn1.2 = sext i16 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
+  %In2.2 = load i16, i16* %PIn2.2, align 2
+  %SIn2.2 = sext i16 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
+  %In1.3 = load i16, i16* %PIn1.3, align 2
+  %SIn1.3 = sext i16 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
+  %In2.3 = load i16, i16* %PIn2.3, align 2
+  %SIn2.3 = sext i16 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: topbottom_mul_load_const
+define void @topbottom_mul_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16* %C) {
+entry:
+  %const = load i16, i16* %C
+  %conv4.i.i = sext i16 %const to i32
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: [[Cast_PIn_0:%[^ ]+]] = bitcast i16* %PIn.0 to i32*
+; CHECK: [[PIn_01:%[^ ]+]] = load i32, i32* [[Cast_PIn_0]], align 2
+; CHECK: [[PIn_01_shl:%[^ ]+]] = shl i32 [[PIn_01]], 16
+; CHECK: [[PIn_0:%[^ ]+]] = ashr i32 [[PIn_01_shl]], 16
+; CHECK: [[PIn_1:%[^ ]+]] = ashr i32 [[PIn_01]], 16
+
+; CHECK: mul nsw i32 [[PIn_0]], %conv4.i.i
+; CHECK: mul nsw i32 [[PIn_1]], %conv4.i.i
+
+; CHECK: [[Cast_PIn_2:%[^ ]+]] = bitcast i16* %PIn.2 to i32*
+; CHECK: [[PIn_23:%[^ ]+]] = load i32, i32* [[Cast_PIn_2]], align 2
+; CHECK: [[PIn_23_shl:%[^ ]+]] = shl i32 [[PIn_23]], 16
+; CHECK: [[PIn_2:%[^ ]+]] = ashr i32 [[PIn_23_shl]], 16
+; CHECK: [[PIn_3:%[^ ]+]] = ashr i32 [[PIn_23]], 16
+
+; CHECK: mul nsw i32 [[PIn_2]], %conv4.i.i
+; CHECK: mul nsw i32 [[PIn_3]], %conv4.i.i
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
+  %In.0 = load i16, i16* %PIn.0, align 2
+  %conv.us.i144.i = sext i16 %In.0 to i32
+  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
+  %In.1 = load i16, i16* %PIn.1, align 2
+  %conv.us.i144.1.i = sext i16 %In.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
+  %In.3 = load i16, i16* %PIn.2, align 2
+  %conv.us.i144.2.i = sext i16 %In.3 to i32
+  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
+  %In.4 = load i16, i16* %PIn.3, align 2
+  %conv.us.i144.3.i = sext i16 %In.4 to i32
+  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: topbottom_mul_64
+define void @topbottom_mul_64(i32 %N, i64* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
+; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
+; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
+; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
+; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
+
+; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
+; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
+; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
+; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
+; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
+
+; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[PIn1_0]], [[PIn2_0]]
+; CHECK: [[SMul0:%[^ ]+]] = sext i32 [[Mul0]] to i64
+; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[PIn1_1]], [[PIn2_1]]
+; CHECK: [[SMul1:%[^ ]+]] = sext i32 [[Mul1]] to i64
+; CHECK: add i64 [[SMul0]], [[SMul1]]
+
+; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
+; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
+; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
+; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
+; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
+
+; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
+; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
+; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
+; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
+; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
+
+; CHECK: [[Mul2:%[^ ]+]] = mul nsw i32 [[PIn1_2]], [[PIn2_2]]
+; CHECK: [[SMul2:%[^ ]+]] = sext i32 [[Mul2]] to i64
+; CHECK: [[Mul3:%[^ ]+]] = mul nsw i32 [[PIn1_3]], [[PIn2_3]]
+; CHECK: [[SMul3:%[^ ]+]] = sext i32 [[Mul3]] to i64
+; CHECK: add i64 [[SMul2]], [[SMul3]]
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %iv.out = phi i32 [ 0, %entry] , [ %iv.out.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
+  %In1.0 = load i16, i16* %PIn1.0, align 2
+  %SIn1.0 = sext i16 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
+  %In2.0 = load i16, i16* %PIn2.0, align 2
+  %SIn2.0 = sext i16 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %sext.0 = sext i32 %mul5.us.i.i to i64
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
+  %In1.1 = load i16, i16* %PIn1.1, align 2
+  %SIn1.1 = sext i16 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
+  %In2.1 = load i16, i16* %PIn2.1, align 2
+  %SIn2.1 = sext i16 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %sext.1 = sext i32 %mul5.us.i.1.i to i64
+  %mac.0 = add i64 %sext.0, %sext.1
+  %Out.0 = getelementptr inbounds i64, i64* %Out, i32 %iv.out
+  store i64 %mac.0, i64* %Out.0, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
+  %In1.2 = load i16, i16* %PIn1.2, align 2
+  %SIn1.2 = sext i16 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
+  %In2.2 = load i16, i16* %PIn2.2, align 2
+  %SIn2.2 = sext i16 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %sext.2 = sext i32 %mul5.us.i.2.i to i64
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
+  %In1.3 = load i16, i16* %PIn1.3, align 2
+  %SIn1.3 = sext i16 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
+  %In2.3 = load i16, i16* %PIn2.3, align 2
+  %SIn2.3 = sext i16 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %sext.3 = sext i32 %mul5.us.i.3.i to i64
+  %mac.1 = add i64 %sext.2, %sext.3
+  %iv.out.1 = or i32 %iv.out, 1
+  %Out.1 = getelementptr inbounds i64, i64* %Out, i32 %iv.out.1
+  store i64 %mac.1, i64* %Out.1, align 4
+  %iv.next = add i32 %iv, 4
+  %iv.out.next = add i32 %iv.out, 2
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}


_______________________________________________
llvm-commits mailing list
llvm-commits at lists.llvm.org<mailto:llvm-commits at lists.llvm.org>
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180917/1d48eb7b/attachment.html>


More information about the llvm-commits mailing list