[llvm] r342210 - [ARM] bottom-top mul support in ARMParallelDSP

Reid Kleckner via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 14 12:31:01 PDT 2018


$ cat convolver.cpp
int a;
char *b;
char c;
short *d;
void h() {
  for (;;) {
    int e, f, g;
    for (; g < 1; ++g, f += a)
      e += d[g] * b[f];
    c = e;
  }
}

$ ./bin/clang++ -cc1 -triple thumbv7-unknown-linux-android -emit-obj
-O2  -vectorize-loops -vectorize-slp -x c++   convolver.cpp
clang++: /usr/local/google/home/rnk/llvm-project/llvm/include/llvm/Support/Casting.h:255:
typename cast_retty<X, Y *>::ret_type llvm::cast(Y *) [X =
llvm::IntegerType, Y = const llvm::Type]: Assertion `isa<X>(Val) &&
"cast<Ty>() argument of incompatible type!"' failed.


On Fri, Sep 14, 2018 at 11:46 AM Reid Kleckner <rnk at google.com> wrote:

> This caused assertion failures while building Chromium for Android, so I
> have reverted it and am starting a reduction:
> https://ci.chromium.org/buildbot/chromium.clang/ToTAndroid/4550
>
> On Fri, Sep 14, 2018 at 1:10 AM Sam Parker via llvm-commits <
> llvm-commits at lists.llvm.org> wrote:
>
>> Author: sam_parker
>> Date: Fri Sep 14 01:09:09 2018
>> New Revision: 342210
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=342210&view=rev
>> Log:
>> [ARM] bottom-top mul support in ARMParallelDSP
>>
>> On failing to find sequences that can be converted into dual macs,
>> try to find sequential 16-bit loads that are used by muls, which we
>> can then combine into smulbb, smulbt, smultb or smultt with a wide load.
>>
>> Differential Revision: https://reviews.llvm.org/D51983
>>
>> Added:
>>     llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll
>>     llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll
>> Modified:
>>     llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
>>
>> Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=342210&r1=342209&r2=342210&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
>> +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Fri Sep 14 01:09:09 2018
>> @@ -55,6 +55,7 @@ namespace {
>>    using ReductionList   = SmallVector<Reduction, 8>;
>>    using ValueList       = SmallVector<Value*, 8>;
>>    using MemInstList     = SmallVector<Instruction*, 8>;
>> +  using LoadInstList    = SmallVector<LoadInst*, 8>;
>>    using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
>>    using PMACPairList    = SmallVector<PMACPair, 8>;
>>    using Instructions    = SmallVector<Instruction*,16>;
>> @@ -63,7 +64,8 @@ namespace {
>>    struct OpChain {
>>      Instruction   *Root;
>>      ValueList     AllValues;
>> -    MemInstList   VecLd;    // List of all load instructions.
>> +    MemInstList   VecLd;    // List of all sequential load instructions.
>> +    LoadInstList  Loads;    // List of all load instructions.
>>      MemLocList    MemLocs;  // All memory locations read by this tree.
>>      bool          ReadOnly = true;
>>
>> @@ -76,8 +78,10 @@ namespace {
>>          if (auto *I = dyn_cast<Instruction>(V)) {
>>            if (I->mayWriteToMemory())
>>              ReadOnly = false;
>> -          if (auto *Ld = dyn_cast<LoadInst>(V))
>> +          if (auto *Ld = dyn_cast<LoadInst>(V)) {
>>              MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(),
>> Size));
>> +            Loads.push_back(Ld);
>> +          }
>>          }
>>        }
>>      }
>> @@ -135,6 +139,7 @@ namespace {
>>      /// exchange the halfwords of the second operand before performing
>> the
>>      /// arithmetic.
>>      bool MatchSMLAD(Function &F);
>> +    bool MatchTopBottomMuls(BasicBlock *LoopBody);
>>
>>    public:
>>      static char ID;
>> @@ -203,6 +208,8 @@ namespace {
>>        LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
>>        LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
>>        Changes = MatchSMLAD(F);
>> +      if (!Changes)
>> +        Changes = MatchTopBottomMuls(Header);
>>        return Changes;
>>      }
>>    };
>> @@ -496,10 +503,10 @@ static void MatchReductions(Function &F,
>>    );
>>  }
>>
>> -static void AddMACCandidate(OpChainList &Candidates,
>> +static void AddMulCandidate(OpChainList &Candidates,
>>                              Instruction *Mul,
>>                              Value *MulOp0, Value *MulOp1) {
>> -  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
>> +  LLVM_DEBUG(dbgs() << "OK, found mul:\t"; Mul->dump());
>>    assert(Mul->getOpcode() == Instruction::Mul &&
>>           "expected mul instruction");
>>    ValueList LHS;
>> @@ -533,14 +540,14 @@ static void MatchParallelMACSequences(Re
>>        break;
>>      case Instruction::Mul:
>>        if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
>> -        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
>> +        AddMulCandidate(Candidates, I, MulOp0, MulOp1);
>>          return false;
>>        }
>>        break;
>>      case Instruction::SExt:
>>        if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
>>          Instruction *Mul = cast<Instruction>(I->getOperand(0));
>> -        AddMACCandidate(Candidates, Mul, MulOp0, MulOp1);
>> +        AddMulCandidate(Candidates, Mul, MulOp0, MulOp1);
>>          return false;
>>        }
>>        break;
>> @@ -569,23 +576,24 @@ static void AliasCandidates(BasicBlock *
>>  // the memory locations accessed by the MAC-chains.
>>  // TODO: we need the read statements when we accept more complicated
>> chains.
>>  static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
>> -                       Instructions &Writes, OpChainList &MACCandidates)
>> {
>> +                       Instructions &Writes, OpChainList &Candidates) {
>>    LLVM_DEBUG(dbgs() << "Alias checks:\n");
>> -  for (auto &MAC : MACCandidates) {
>> -    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
>> +  for (auto &Candidate : Candidates) {
>> +    LLVM_DEBUG(dbgs() << "mul: "; Candidate->Root->dump());
>> +    Candidate->SetMemoryLocations();
>>
>>      // At the moment, we allow only simple chains that only consist of
>> reads,
>>      // accumulate their result with an integer add, and thus that don't
>> write
>>      // memory, and simply bail if they do.
>> -    if (!MAC->ReadOnly)
>> +    if (!Candidate->ReadOnly)
>>        return true;
>>
>>      // Now for all writes in the basic block, check that they don't
>> alias with
>>      // the memory locations accessed by our MAC-chain:
>>      for (auto *I : Writes) {
>>        LLVM_DEBUG(dbgs() << "- "; I->dump());
>> -      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
>> -      for (auto &MemLoc : MAC->MemLocs) {
>> +      assert(Candidate->MemLocs.size() >= 2 && "expecting at least 2
>> memlocs");
>> +      for (auto &MemLoc : Candidate->MemLocs) {
>>          if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
>>                                            ModRefInfo::ModRef))) {
>>            LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
>> @@ -599,7 +607,7 @@ static bool AreAliased(AliasAnalysis *AA
>>    return false;
>>  }
>>
>> -static bool CheckMACMemory(OpChainList &Candidates) {
>> +static bool CheckMulMemory(OpChainList &Candidates) {
>>    for (auto &C : Candidates) {
>>      // A mul has 2 operands, and a narrow op consist of sext and a load;
>> thus
>>      // we expect at least 4 items in this operand value list.
>> @@ -607,7 +615,6 @@ static bool CheckMACMemory(OpChainList &
>>        LLVM_DEBUG(dbgs() << "Operand list too short.\n");
>>        return false;
>>      }
>> -    C->SetMemoryLocations();
>>      ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
>>      ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
>>
>> @@ -620,6 +627,131 @@ static bool CheckMACMemory(OpChainList &
>>    return true;
>>  }
>>
>> +static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst
>> *BaseLoad,
>> +                               const Type *LoadTy) {
>> +  const unsigned AddrSpace = BaseLoad->getPointerAddressSpace();
>> +
>> +  Value *VecPtr = IRB.CreateBitCast(BaseLoad->getPointerOperand(),
>> +                                     LoadTy->getPointerTo(AddrSpace));
>> +  return IRB.CreateAlignedLoad(VecPtr, BaseLoad->getAlignment());
>> +}
>> +
>> +/// Attempt to widen loads and use smulbb, smulbt, smultb and smultt
>> muls.
>> +// TODO: This, like smlad generation, expects the leaf operands to be
>> loads
>> +// that are sign extended. We should be able to handle scalar values as
>> well
>> +// performing these muls on word x half types to generate smulwb and
>> smulwt.
>> +bool ARMParallelDSP::MatchTopBottomMuls(BasicBlock *LoopBody) {
>> +  LLVM_DEBUG(dbgs() << "Attempting to find BT|TB muls.\n");
>> +
>> +  OpChainList Candidates;
>> +  for (auto &I : *LoopBody) {
>> +    if (I.getOpcode() == Instruction::Mul) {
>> +      if (I.getType()->getScalarSizeInBits() == 32 ||
>> +          I.getType()->getScalarSizeInBits() == 64)
>> +      AddMulCandidate(Candidates, &I, I.getOperand(0), I.getOperand(1));
>> +    }
>> +  }
>> +
>> +  if (Candidates.empty())
>> +    return false;
>> +
>> +  Instructions Reads;
>> +  Instructions Writes;
>> +  AliasCandidates(LoopBody, Reads, Writes);
>> +
>> +  if (AreAliased(AA, Reads, Writes, Candidates))
>> +    return false;
>> +
>> +  DenseMap<LoadInst*, Instruction*> LoadUsers;
>> +  DenseMap<LoadInst*, LoadInst*> SeqLoads;
>> +  SmallPtrSet<LoadInst*, 8> OffsetLoads;
>> +
>> +  for (unsigned i = 0; i < Candidates.size(); ++i) {
>> +    for (unsigned j = 0; j < Candidates.size(); ++j) {
>> +      if (i == j)
>> +        continue;
>> +
>> +      OpChain *MulChain0 = Candidates[i].get();
>> +      OpChain *MulChain1 = Candidates[j].get();
>> +
>> +      for (auto *Ld0 : MulChain0->Loads) {
>> +        if (SeqLoads.count(Ld0) || OffsetLoads.count(Ld0))
>> +          continue;
>> +
>> +        for (auto *Ld1 : MulChain1->Loads) {
>> +          if (SeqLoads.count(Ld1) || OffsetLoads.count(Ld1))
>> +            continue;
>> +
>> +          MemInstList VecMem;
>> +          if (AreSequentialLoads(Ld0, Ld1, VecMem)) {
>> +            SeqLoads[Ld0] = Ld1;
>> +            OffsetLoads.insert(Ld1);
>> +            LoadUsers[Ld0] = MulChain0->Root;
>> +            LoadUsers[Ld1] = MulChain1->Root;
>> +          }
>> +        }
>> +      }
>> +    }
>> +  }
>> +
>> +  if (SeqLoads.empty())
>> +    return false;
>> +
>> +  IRBuilder<NoFolder> IRB(LoopBody);
>> +  const Type *Ty = IntegerType::get(M->getContext(), 32);
>> +
>> +  // We know that at least one of the operands is a SExt of Ld.
>> +  auto GetSExt = [](Instruction *I, LoadInst *Ld, unsigned OpIdx) ->
>> Instruction* {
>> +    if (!isa<Instruction>(I->getOperand(OpIdx)))
>> +      return nullptr;
>> +
>> +    Value *SExt = nullptr;
>> +    if (cast<Instruction>(I->getOperand(OpIdx))->getOperand(0) == Ld)
>> +      SExt = I->getOperand(0);
>> +    else
>> +      SExt = I->getOperand(1);
>> +
>> +    return cast<Instruction>(SExt);
>> +  };
>> +
>> +  LLVM_DEBUG(dbgs() << "Found some sequential loads, now widening:\n");
>> +  for (auto &Pair : SeqLoads) {
>> +    LoadInst *BaseLd = Pair.first;
>> +    LoadInst *OffsetLd = Pair.second;
>> +    IRB.SetInsertPoint(BaseLd);
>> +    LoadInst *WideLd = CreateLoadIns(IRB, BaseLd, Ty);
>> +    LLVM_DEBUG(dbgs() << " - with base load: " << *BaseLd << "\n");
>> +    LLVM_DEBUG(dbgs() << " - created wide load: " << *WideLd << "\n");
>> +    Instruction *BaseUser = LoadUsers[BaseLd];
>> +    Instruction *OffsetUser = LoadUsers[OffsetLd];
>> +
>> +    Instruction *BaseSExt = GetSExt(BaseUser, BaseLd, 0);
>> +    if (!BaseSExt)
>> +      BaseSExt = GetSExt(BaseUser, BaseLd, 1);
>> +    Instruction *OffsetSExt = GetSExt(OffsetUser, OffsetLd, 0);
>> +    if (!OffsetSExt)
>> +      OffsetSExt = GetSExt(OffsetUser, OffsetLd, 1);
>> +
>> +    assert((BaseSExt && OffsetSExt) && "failed to find SExts");
>> +
>> +    // BaseUser needs to: (asr (shl WideLoad, 16), 16)
>> +    // OffsetUser needs to: (asr WideLoad, 16)
>> +    auto *Shl = cast<Instruction>(IRB.CreateShl(WideLd, 16));
>> +    auto *Bottom = cast<Instruction>(IRB.CreateAShr(Shl, 16));
>> +    auto *Top = cast<Instruction>(IRB.CreateAShr(WideLd, 16));
>> +    BaseUser->replaceUsesOfWith(BaseSExt, Bottom);
>> +    OffsetUser->replaceUsesOfWith(OffsetSExt, Top);
>> +
>> +    BaseSExt->eraseFromParent();
>> +    OffsetSExt->eraseFromParent();
>> +    BaseLd->eraseFromParent();
>> +    OffsetLd->eraseFromParent();
>> +  }
>> +  LLVM_DEBUG(dbgs() << "Block after top bottom mul replacements:\n"
>> +             << *LoopBody << "\n");
>> +  return true;
>> +}
>> +
>>  // Loop Pass that needs to identify integer add/sub reductions of 16-bit
>> vector
>>  // multiplications.
>>  // To use SMLAD:
>> @@ -658,14 +790,15 @@ bool ARMParallelDSP::MatchSMLAD(Function
>>               dbgs() << "Header block:\n"; Header->dump();
>>               dbgs() << "Loop info:\n\n"; L->dump());
>>
>> -  bool Changed = false;
>>    ReductionList Reductions;
>>    MatchReductions(F, L, Header, Reductions);
>> +  if (Reductions.empty())
>> +    return false;
>>
>>    for (auto &R : Reductions) {
>>      OpChainList MACCandidates;
>>      MatchParallelMACSequences(R, MACCandidates);
>> -    if (!CheckMACMemory(MACCandidates))
>> +    if (!CheckMulMemory(MACCandidates))
>>        continue;
>>
>>      R.MACCandidates = std::move(MACCandidates);
>> @@ -682,6 +815,7 @@ bool ARMParallelDSP::MatchSMLAD(Function
>>    Instructions Reads, Writes;
>>    AliasCandidates(Header, Reads, Writes);
>>
>> +  bool Changed = false;
>>    for (auto &R : Reductions) {
>>      if (AreAliased(AA, Reads, Writes, R.MACCandidates))
>>        return false;
>> @@ -693,15 +827,6 @@ bool ARMParallelDSP::MatchSMLAD(Function
>>    return Changed;
>>  }
>>
>> -static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst
>> &BaseLoad,
>> -                               const Type *LoadTy) {
>> -  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
>> -
>> -  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
>> -                                    LoadTy->getPointerTo(AddrSpace));
>> -  return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
>> -}
>> -
>>  Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst
>> *VecLd1,
>>                                               Instruction *Acc, bool
>> Exchange,
>>                                               Instruction *InsertAfter) {
>> @@ -716,8 +841,8 @@ Instruction *ARMParallelDSP::CreateSMLAD
>>
>>    // Replace the reduction chain with an intrinsic call
>>    const Type *Ty = IntegerType::get(M->getContext(), 32);
>> -  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
>> -  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
>> +  LoadInst *NewLd0 = CreateLoadIns(Builder, &VecLd0[0], Ty);
>> +  LoadInst *NewLd1 = CreateLoadIns(Builder, &VecLd1[0], Ty);
>>    Value* Args[] = { NewLd0, NewLd1, Acc };
>>    Function *SMLAD = nullptr;
>>    if (Exchange)
>>
>> Added: llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll?rev=342210&view=auto
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll (added)
>> +++ llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll Fri Sep 14
>> 01:09:09 2018
>> @@ -0,0 +1,209 @@
>> +; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp
>> -S | FileCheck %s
>> +
>> +; CHECK-LABEL: topbottom_mul_alias
>> +; CHECK-NOT: bitcast i16*
>> +define void @topbottom_mul_alias(i32 %N, i32* nocapture readnone %Out,
>> i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
>> +entry:
>> +  br label %for.body
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
>> +  %In1.0 = load i16, i16* %PIn1.0, align 2
>> +  %SIn1.0 = sext i16 %In1.0 to i32
>> +  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
>> +  %In2.0 = load i16, i16* %PIn2.0, align 2
>> +  %SIn2.0 = sext i16 %In2.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
>> +  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
>> +  store i32 %mul5.us.i.i, i32* %Out.0, align 4
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
>> +  %In1.1 = load i16, i16* %PIn1.1, align 2
>> +  %SIn1.1 = sext i16 %In1.1 to i32
>> +  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
>> +  %In2.1 = load i16, i16* %PIn2.1, align 2
>> +  %SIn2.1 = sext i16 %In2.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
>> +  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
>> +  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
>> +  %In1.2 = load i16, i16* %PIn1.2, align 2
>> +  %SIn1.2 = sext i16 %In1.2 to i32
>> +  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
>> +  %In2.2 = load i16, i16* %PIn2.2, align 2
>> +  %SIn2.2 = sext i16 %In2.2 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
>> +  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
>> +  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
>> +  %In1.3 = load i16, i16* %PIn1.3, align 2
>> +  %SIn1.3 = sext i16 %In1.3 to i32
>> +  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
>> +  %In2.3 = load i16, i16* %PIn2.3, align 2
>> +  %SIn2.3 = sext i16 %In2.3 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
>> +  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
>> +  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>> +
>> +; TODO: We should be able to handle this by splatting the const value.
>> +; CHECK-LABEL: topbottom_mul_const
>> +; CHECK-NOT: bitcast i16*
>> +define void @topbottom_mul_const(i32 %N, i32* noalias nocapture readnone
>> %Out, i16* nocapture readonly %In, i16 signext %const) {
>> +entry:
>> +  %conv4.i.i = sext i16 %const to i32
>> +  br label %for.body
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
>> +  %In.0 = load i16, i16* %PIn.0, align 2
>> +  %conv.us.i144.i = sext i16 %In.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
>> +  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
>> +  store i32 %mul5.us.i.i, i32* %Out.0, align 4
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
>> +  %In.1 = load i16, i16* %PIn.1, align 2
>> +  %conv.us.i144.1.i = sext i16 %In.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
>> +  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
>> +  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
>> +  %In.3 = load i16, i16* %PIn.2, align 2
>> +  %conv.us.i144.2.i = sext i16 %In.3 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
>> +  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
>> +  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
>> +  %In.4 = load i16, i16* %PIn.3, align 2
>> +  %conv.us.i144.3.i = sext i16 %In.4 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
>> +  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
>> +  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>> +
>> +; TODO: We should be able to handle this and use smulwt and smulwb.
>> +; CHECK-LABEL: topbottom_mul_word_load_const
>> +; CHECK-NOT: bitcast i16*
>> +define void @topbottom_mul_word_load_const(i32 %N, i32* noalias
>> nocapture readnone %Out, i16* nocapture readonly %In, i32* %C) {
>> +entry:
>> +  %const = load i32, i32* %C
>> +  br label %for.body
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
>> +  %In.0 = load i16, i16* %PIn.0, align 2
>> +  %conv.us.i144.i = sext i16 %In.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %const
>> +  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
>> +  store i32 %mul5.us.i.i, i32* %Out.0, align 4
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
>> +  %In.1 = load i16, i16* %PIn.1, align 2
>> +  %conv.us.i144.1.i = sext i16 %In.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %const
>> +  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
>> +  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
>> +  %In.3 = load i16, i16* %PIn.2, align 2
>> +  %conv.us.i144.2.i = sext i16 %In.3 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %const
>> +  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
>> +  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
>> +  %In.4 = load i16, i16* %PIn.3, align 2
>> +  %conv.us.i144.3.i = sext i16 %In.4 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %const
>> +  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
>> +  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>> +
>> +; CHECK-LABEL: topbottom_mul_8
>> +; CHECK-NOT: bitcast i16*
>> +define void @topbottom_mul_8(i32 %N, i32* noalias nocapture readnone
>> %Out, i8* nocapture readonly %In1, i8* nocapture readonly %In2) {
>> +entry:
>> +  br label %for.body
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn1.0 = getelementptr inbounds i8, i8* %In1, i32 %iv
>> +  %In1.0 = load i8, i8* %PIn1.0, align 1
>> +  %SIn1.0 = sext i8 %In1.0 to i32
>> +  %PIn2.0 = getelementptr inbounds i8, i8* %In2, i32 %iv
>> +  %In2.0 = load i8, i8* %PIn2.0, align 1
>> +  %SIn2.0 = sext i8 %In2.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
>> +  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
>> +  store i32 %mul5.us.i.i, i32* %Out.0, align 4
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn1.1 = getelementptr inbounds i8, i8* %In1, i32 %iv.1
>> +  %In1.1 = load i8, i8* %PIn1.1, align 1
>> +  %SIn1.1 = sext i8 %In1.1 to i32
>> +  %PIn2.1 = getelementptr inbounds i8, i8* %In2, i32 %iv.1
>> +  %In2.1 = load i8, i8* %PIn2.1, align 1
>> +  %SIn2.1 = sext i8 %In2.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
>> +  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
>> +  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn1.2 = getelementptr inbounds i8, i8* %In1, i32 %iv.2
>> +  %In1.2 = load i8, i8* %PIn1.2, align 1
>> +  %SIn1.2 = sext i8 %In1.2 to i32
>> +  %PIn2.2 = getelementptr inbounds i8, i8* %In2, i32 %iv.2
>> +  %In2.2 = load i8, i8* %PIn2.2, align 1
>> +  %SIn2.2 = sext i8 %In2.2 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
>> +  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
>> +  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn1.3 = getelementptr inbounds i8, i8* %In1, i32 %iv.3
>> +  %In1.3 = load i8, i8* %PIn1.3, align 1
>> +  %SIn1.3 = sext i8 %In1.3 to i32
>> +  %PIn2.3 = getelementptr inbounds i8, i8* %In2, i32 %iv.3
>> +  %In2.3 = load i8, i8* %PIn2.3, align 1
>> +  %SIn2.3 = sext i8 %In2.3 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
>> +  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
>> +  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>>
>> Added: llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll?rev=342210&view=auto
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll (added)
>> +++ llvm/trunk/test/CodeGen/ARM/paralleldsp-top-bottom.ll Fri Sep 14
>> 01:09:09 2018
>> @@ -0,0 +1,251 @@
>> +; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp
>> -S | FileCheck %s
>> +
>> +; CHECK-LABEL: topbottom_mul
>> +define void @topbottom_mul(i32 %N, i32* noalias nocapture readnone %Out,
>> i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
>> +entry:
>> +  br label %for.body
>> +
>> +; CHECK: for.body:
>> +; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
>> +; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
>> +; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
>> +; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
>> +; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
>> +
>> +; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
>> +; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
>> +; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
>> +; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
>> +; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
>> +
>> +; CHECK: mul nsw i32 [[PIn1_0]], [[PIn2_0]]
>> +; CHECK: mul nsw i32 [[PIn1_1]], [[PIn2_1]]
>> +
>> +; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
>> +; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
>> +; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
>> +; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
>> +; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
>> +
>> +; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
>> +; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
>> +; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
>> +; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
>> +; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
>> +
>> +; CHECK: mul nsw i32 [[PIn1_2]], [[PIn2_2]]
>> +; CHECK: mul nsw i32 [[PIn1_3]], [[PIn2_3]]
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
>> +  %In1.0 = load i16, i16* %PIn1.0, align 2
>> +  %SIn1.0 = sext i16 %In1.0 to i32
>> +  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
>> +  %In2.0 = load i16, i16* %PIn2.0, align 2
>> +  %SIn2.0 = sext i16 %In2.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
>> +  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
>> +  store i32 %mul5.us.i.i, i32* %Out.0, align 4
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
>> +  %In1.1 = load i16, i16* %PIn1.1, align 2
>> +  %SIn1.1 = sext i16 %In1.1 to i32
>> +  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
>> +  %In2.1 = load i16, i16* %PIn2.1, align 2
>> +  %SIn2.1 = sext i16 %In2.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
>> +  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
>> +  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
>> +  %In1.2 = load i16, i16* %PIn1.2, align 2
>> +  %SIn1.2 = sext i16 %In1.2 to i32
>> +  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
>> +  %In2.2 = load i16, i16* %PIn2.2, align 2
>> +  %SIn2.2 = sext i16 %In2.2 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
>> +  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
>> +  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
>> +  %In1.3 = load i16, i16* %PIn1.3, align 2
>> +  %SIn1.3 = sext i16 %In1.3 to i32
>> +  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
>> +  %In2.3 = load i16, i16* %PIn2.3, align 2
>> +  %SIn2.3 = sext i16 %In2.3 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
>> +  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
>> +  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>> +
>> +; CHECK-LABEL: topbottom_mul_load_const
>> +define void @topbottom_mul_load_const(i32 %N, i32* noalias nocapture
>> readnone %Out, i16* nocapture readonly %In, i16* %C) {
>> +entry:
>> +  %const = load i16, i16* %C
>> +  %conv4.i.i = sext i16 %const to i32
>> +  br label %for.body
>> +
>> +; CHECK: for.body:
>> +; CHECK: [[Cast_PIn_0:%[^ ]+]] = bitcast i16* %PIn.0 to i32*
>> +; CHECK: [[PIn_01:%[^ ]+]] = load i32, i32* [[Cast_PIn_0]], align 2
>> +; CHECK: [[PIn_01_shl:%[^ ]+]] = shl i32 [[PIn_01]], 16
>> +; CHECK: [[PIn_0:%[^ ]+]] = ashr i32 [[PIn_01_shl]], 16
>> +; CHECK: [[PIn_1:%[^ ]+]] = ashr i32 [[PIn_01]], 16
>> +
>> +; CHECK: mul nsw i32 [[PIn_0]], %conv4.i.i
>> +; CHECK: mul nsw i32 [[PIn_1]], %conv4.i.i
>> +
>> +; CHECK: [[Cast_PIn_2:%[^ ]+]] = bitcast i16* %PIn.2 to i32*
>> +; CHECK: [[PIn_23:%[^ ]+]] = load i32, i32* [[Cast_PIn_2]], align 2
>> +; CHECK: [[PIn_23_shl:%[^ ]+]] = shl i32 [[PIn_23]], 16
>> +; CHECK: [[PIn_2:%[^ ]+]] = ashr i32 [[PIn_23_shl]], 16
>> +; CHECK: [[PIn_3:%[^ ]+]] = ashr i32 [[PIn_23]], 16
>> +
>> +; CHECK: mul nsw i32 [[PIn_2]], %conv4.i.i
>> +; CHECK: mul nsw i32 [[PIn_3]], %conv4.i.i
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
>> +  %In.0 = load i16, i16* %PIn.0, align 2
>> +  %conv.us.i144.i = sext i16 %In.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
>> +  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
>> +  store i32 %mul5.us.i.i, i32* %Out.0, align 4
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
>> +  %In.1 = load i16, i16* %PIn.1, align 2
>> +  %conv.us.i144.1.i = sext i16 %In.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
>> +  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
>> +  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
>> +  %In.3 = load i16, i16* %PIn.2, align 2
>> +  %conv.us.i144.2.i = sext i16 %In.3 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
>> +  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
>> +  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
>> +  %In.4 = load i16, i16* %PIn.3, align 2
>> +  %conv.us.i144.3.i = sext i16 %In.4 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
>> +  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
>> +  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>> +
>> +; CHECK-LABEL: topbottom_mul_64
>> +define void @topbottom_mul_64(i32 %N, i64* noalias nocapture readnone
>> %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
>> +entry:
>> +  br label %for.body
>> +
>> +; CHECK: for.body:
>> +; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
>> +; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
>> +; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
>> +; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
>> +; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
>> +
>> +; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
>> +; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
>> +; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
>> +; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
>> +; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
>> +
>> +; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[PIn1_0]], [[PIn2_0]]
>> +; CHECK: [[SMul0:%[^ ]+]] = sext i32 [[Mul0]] to i64
>> +; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[PIn1_1]], [[PIn2_1]]
>> +; CHECK: [[SMul1:%[^ ]+]] = sext i32 [[Mul1]] to i64
>> +; CHECK: add i64 [[SMul0]], [[SMul1]]
>> +
>> +; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
>> +; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
>> +; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
>> +; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
>> +; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
>> +
>> +; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
>> +; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
>> +; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
>> +; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
>> +; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
>> +
>> +; CHECK: [[Mul2:%[^ ]+]] = mul nsw i32 [[PIn1_2]], [[PIn2_2]]
>> +; CHECK: [[SMul2:%[^ ]+]] = sext i32 [[Mul2]] to i64
>> +; CHECK: [[Mul3:%[^ ]+]] = mul nsw i32 [[PIn1_3]], [[PIn2_3]]
>> +; CHECK: [[SMul3:%[^ ]+]] = sext i32 [[Mul3]] to i64
>> +; CHECK: add i64 [[SMul2]], [[SMul3]]
>> +
>> +for.body:
>> +  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
>> +  %iv.out = phi i32 [ 0, %entry] , [ %iv.out.next, %for.body ]
>> +  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
>> +  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
>> +  %In1.0 = load i16, i16* %PIn1.0, align 2
>> +  %SIn1.0 = sext i16 %In1.0 to i32
>> +  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
>> +  %In2.0 = load i16, i16* %PIn2.0, align 2
>> +  %SIn2.0 = sext i16 %In2.0 to i32
>> +  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
>> +  %sext.0 = sext i32 %mul5.us.i.i to i64
>> +  %iv.1 = or i32 %iv, 1
>> +  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
>> +  %In1.1 = load i16, i16* %PIn1.1, align 2
>> +  %SIn1.1 = sext i16 %In1.1 to i32
>> +  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
>> +  %In2.1 = load i16, i16* %PIn2.1, align 2
>> +  %SIn2.1 = sext i16 %In2.1 to i32
>> +  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
>> +  %sext.1 = sext i32 %mul5.us.i.1.i to i64
>> +  %mac.0 = add i64 %sext.0, %sext.1
>> +  %Out.0 = getelementptr inbounds i64, i64* %Out, i32 %iv.out
>> +  store i64 %mac.0, i64* %Out.0, align 4
>> +  %iv.2 = or i32 %iv, 2
>> +  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
>> +  %In1.2 = load i16, i16* %PIn1.2, align 2
>> +  %SIn1.2 = sext i16 %In1.2 to i32
>> +  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
>> +  %In2.2 = load i16, i16* %PIn2.2, align 2
>> +  %SIn2.2 = sext i16 %In2.2 to i32
>> +  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
>> +  %sext.2 = sext i32 %mul5.us.i.2.i to i64
>> +  %iv.3 = or i32 %iv, 3
>> +  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
>> +  %In1.3 = load i16, i16* %PIn1.3, align 2
>> +  %SIn1.3 = sext i16 %In1.3 to i32
>> +  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
>> +  %In2.3 = load i16, i16* %PIn2.3, align 2
>> +  %SIn2.3 = sext i16 %In2.3 to i32
>> +  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
>> +  %sext.3 = sext i32 %mul5.us.i.3.i to i64
>> +  %mac.1 = add i64 %sext.2, %sext.3
>> +  %iv.out.1 = or i32 %iv.out, 1
>> +  %Out.1 = getelementptr inbounds i64, i64* %Out, i32 %iv.out.1
>> +  store i64 %mac.1, i64* %Out.1, align 4
>> +  %iv.next = add i32 %iv, 4
>> +  %iv.out.next = add i32 %iv.out, 2
>> +  %count.next = add i32 %count, -4
>> +  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
>> +  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
>> +
>> +exit:
>> +  ret void
>> +}
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180914/7ed90a85/attachment.html>


More information about the llvm-commits mailing list