[llvm] r179414 - SLPVectorizer: add support for vectorization of diamond shaped trees. We now perform a preliminary traversal of the graph to collect values with multiple users and check where the users came from.
Nadav Rotem
nrotem at apple.com
Fri Apr 12 14:16:54 PDT 2013
Author: nadav
Date: Fri Apr 12 16:16:54 2013
New Revision: 179414
URL: http://llvm.org/viewvc/llvm-project?rev=179414&view=rev
Log:
SLPVectorizer: add support for vectorization of diamond shaped trees. We now perform a preliminary traversal of the graph to collect values with multiple users and check where the users came from.
Added:
llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll
Modified:
llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp
llvm/trunk/lib/Transforms/Vectorize/VecUtils.h
Modified: llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp?rev=179414&r1=179413&r2=179414&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp Fri Apr 12 16:16:54 2013
@@ -6,7 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "VecUtils"
+#define DEBUG_TYPE "SLP"
#include "VecUtils.h"
#include "llvm/ADT/DenseMap.h"
@@ -37,6 +37,10 @@
using namespace llvm;
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 6;
+
namespace llvm {
BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
@@ -98,9 +102,39 @@ bool BoUpSLP::isConsecutiveAccess(Value
return ((-Offset) == Sz);
}
+/// \brief Try to vectorize a chain of consecutive stores, VF stores at a time.
+///
+/// VF is the number of stored elements that fit into MinVecRegSize bits.
+/// Every window of VF stores is costed with getTreeCost and vectorized when
+/// the cost is below \p CostThreshold.
+/// \returns true if at least one group of stores was vectorized.
+bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) {
+  // Nothing to do for an empty chain; Chain[0] below would be out of bounds.
+  if (Chain.empty()) return false;
+
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+
+  // Validate the element size before dividing by it: isPowerOf2_32 rejects
+  // zero, so a zero-sized stored type can no longer divide by zero.
+  if (!isPowerOf2_32(Sz)) return false;
+  unsigned VF = MinVecRegSize / Sz;
+  if (VF < 2) return false;
+
+  bool Changed = false;
+  for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
+    // Stop once fewer than VF stores remain.
+    if (i + VF > e) return Changed;
+    DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
+    ValueList Operands(&Chain[i], &Chain[i] + VF);
+
+    int Cost = getTreeCost(Operands);
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      vectorizeTree(Operands, VF);
+      // The loop header adds the last step, so advance by VF - 1 to resume at
+      // the first store after this group instead of skipping it.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
ValueSet Heads, Tails;
SmallDenseMap<Value*, Value*> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ ValueSet VectorizedStores;
bool Changed = false;
// Do a quadratic search on all of the given stores and find
@@ -123,27 +157,17 @@ bool BoUpSLP::vectorizeStores(StoreList
// to vectorize it.
ValueList Operands;
Value *I = *it;
- int MinCost = 0, MinVF = 0;
+ // Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
+ if (VectorizedStores.count(I)) break;
Operands.push_back(I);
- unsigned VF = Operands.size();
- if (isPowerOf2_32(VF) && VF > 1) {
- int cost = getTreeRollCost(Operands, 0);
- DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
- if (cost < MinCost) { MinCost = cost; MinVF = VF; }
- }
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
- if (MinCost <= costThreshold && MinVF > 1) {
- DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n");
- vectorizeTree(Operands, MinVF);
- Stores.clear();
- // The current numbering is invalid because we added and removed instrs.
- numberInstructions();
- Changed = true;
- }
+ bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
+ if (Vectorized) VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed |= Vectorized;
}
return Changed;
@@ -184,8 +208,138 @@ Value *BoUpSLP::isUnsafeToSink(Instructi
return 0;
}
-int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
- if (Depth == 6) return max_cost;
+/// \brief Compute the cost of vectorizing the tree that starts at \p VL.
+///
+/// First resets the per-tree state, then collects lane and multi-user
+/// information with getTreeUses_rec, marks values that must stay scalar, and
+/// finally delegates the cost computation to getTreeCost_rec.
+/// \returns the value computed by getTreeCost_rec for \p VL at depth 0.
+int BoUpSLP::getTreeCost(ValueList &VL) {
+  // Drop the state that was collected for the previously analyzed tree.
+  MemBarrierIgnoreList.clear();
+  LaneMap.clear();
+  MultiUserVals.clear();
+  MustScalarize.clear();
+
+  // Scan the tree and find which value is used by which lane, and which values
+  // must be scalarized.
+  getTreeUses_rec(VL, 0);
+
+  // Check that instructions with multiple users can be vectorized. Mark unsafe
+  // instructions.
+  for (ValueSet::iterator it = MultiUserVals.begin(),
+       e = MultiUserVals.end(); it != e; ++it) {
+    // Check that all of the users of this instr are within the tree
+    // and that they are all from the same lane.
+    int Lane = -1;
+    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
+         I != E; ++I) {
+      // Look the user up once; a miss means it is not part of the tree.
+      std::map<Value*, int>::iterator UserLane = LaneMap.find(*I);
+      if (UserLane == LaneMap.end()) {
+        MustScalarize.insert(*it);
+        DEBUG(dbgs()<<"SLP: Adding " << **it <<
+              " to MustScalarize because of an out of tree usage.\n");
+        break;
+      }
+      // The first user fixes the lane; every later user has to match it.
+      if (Lane == -1) Lane = UserLane->second;
+      if (Lane != UserLane->second) {
+        MustScalarize.insert(*it);
+        DEBUG(dbgs()<<"SLP: Adding " << **it <<
+              " to MustScalarize because multiple lane use it: "
+              << Lane << " and " << UserLane->second << ".\n");
+        break;
+      }
+    }
+  }
+
+  // Now calculate the cost of vectorizing the tree.
+  return getTreeCost_rec(VL, 0);
+}
+
+/// \brief Recursively walk the tree that starts at the bundle \p VL and record
+/// how its values are used.
+///
+/// Every visited value is entered into LaneMap together with its lane, and
+/// visited instructions with more than one use are added to MultiUserVals.
+/// The walk gives up on bundles that contain vectors, only constants, one
+/// repeated scalar, instructions from another basic block or mixed opcodes,
+/// and once \p Depth reaches RecursionMaxDepth.
+void BoUpSLP::getTreeUses_rec(ValueList &VL, unsigned Depth) {
+  if (Depth == RecursionMaxDepth) return;
+
+  // Don't handle vectors.
+  if (VL[0]->getType()->isVectorTy()) return;
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    if (SI->getValueOperand()->getType()->isVectorTy()) return;
+
+  // Check if all of the operands are constants.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If one of the instructions is out of this BB, we need to scalarize all.
+    if (I && I->getParent() != BB) return;
+  }
+
+  // If all of the operands are identical or constant we have a simple solution.
+  if (AllConst || AllSameScalar) return;
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return;
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return;
+  }
+
+  // Mark instructions with multiple users.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Remember to check if all of the users of this instr are vectorized
+    // within our tree.
+    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
+  }
+
+  for (int i = 0, e = VL.size(); i < e; ++i) {
+    // Check that the instruction is only used within
+    // one lane.
+    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
+    // Make this instruction as 'seen' and remember the lane.
+    LaneMap[VL[i]] = i;
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      getTreeUses_rec(Operands, Depth+1);
+    }
+    // Every operand has been visited. Without this return, control fell
+    // through into the Store case and walked operand 0 a second time.
+    return;
+  }
+  case Instruction::Store: {
+    // Only the stored value (operand 0) is followed.
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j)
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+    getTreeUses_rec(Operands, Depth+1);
+    return;
+  }
+  default:
+    return;
+  }
+}
+
+int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -193,9 +347,10 @@ int BoUpSLP::getTreeRollCost(ValueList &
/// Don't mess with vectors.
if (ScalarTy->isVectorTy()) return max_cost;
-
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
+
// Check if all of the operands are constants.
bool AllConst = true;
bool AllSameScalar = true;
@@ -204,8 +359,8 @@ int BoUpSLP::getTreeRollCost(ValueList &
AllSameScalar &= (VL[0] == VL[i]);
// Must have a single use.
Instruction *I = dyn_cast<Instruction>(VL[i]);
- // Need to scalarize instructions with multiple users or from other BBs.
- if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
+ // This instruction is outside the basic block or if it is a known hazard.
+ if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
return getScalarizationCost(VecTy);
}
@@ -239,7 +394,7 @@ int BoUpSLP::getTreeRollCost(ValueList &
if (VL[i] == Last) continue;
Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
if (Barrier) {
- DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
+ DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
*Last << "\n because of " << *Barrier << "\n");
return max_cost;
}
@@ -265,20 +420,22 @@ int BoUpSLP::getTreeRollCost(ValueList &
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- ValueList Operands;
int Cost = 0;
// Calculate the cost of all of the operands.
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
// Prepare the operand vector.
for (unsigned j = 0; j < VL.size(); ++j)
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
- Cost += getTreeRollCost(Operands, Depth+1);
- Operands.clear();
+
+ Cost += getTreeCost_rec(Operands, Depth+1);
+ if (Cost >= max_cost) return max_cost;
}
// Calculate the cost of this instruction.
int ScalarCost = VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
Cost += (VecCost - ScalarCost);
return Cost;
@@ -308,8 +465,7 @@ int BoUpSLP::getTreeRollCost(ValueList &
MemBarrierIgnoreList.insert(VL[j]);
}
- int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1);
- MemBarrierIgnoreList.clear();
+ int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
return TotalCost;
}
default:
@@ -334,6 +490,15 @@ Value *BoUpSLP::Scalarize(ValueList &VL,
}
Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
+ Value *V = vectorizeTree_rec(VL, VF);
+ // We moved some instructions around. We have to number them again
+ // before we can do any analysis.
+ numberInstructions();
+ MustScalarize.clear();
+ return V;
+}
+
+Value *BoUpSLP::vectorizeTree_rec(ValueList &VL, int VF) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
@@ -345,19 +510,21 @@ Value *BoUpSLP::vectorizeTree(ValueList
for (unsigned i = 0, e = VF; i < e; ++i) {
AllConst &= !!dyn_cast<Constant>(VL[i]);
AllSameScalar &= (VL[0] == VL[i]);
- // Must have a single use.
+ // The instruction must be in the same BB, and it must be vectorizable.
Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (I && (I->getNumUses() > 1 || I->getParent() != BB))
+ if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
return Scalarize(VL, VecTy);
}
- // Is this a simple vector constant.
+ // Check that this is a simple vector constant.
if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
// Scalarize unknown structures.
Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
if (!VL0) return Scalarize(VL, VecTy);
+ if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
+
unsigned Opcode = VL0->getOpcode();
for (unsigned i = 0, e = VF; i < e; ++i) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
@@ -390,11 +557,13 @@ Value *BoUpSLP::vectorizeTree(ValueList
LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
}
- Value *RHS = vectorizeTree(RHSVL, VF);
- Value *LHS = vectorizeTree(LHSVL, VF);
+ Value *RHS = vectorizeTree_rec(RHSVL, VF);
+ Value *LHS = vectorizeTree_rec(LHSVL, VF);
IRBuilder<> Builder(GetLastInstr(VL, VF));
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(VL0);
- return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
+ VectorizedValues[VL0] = V;
+ return V;
}
case Instruction::Load: {
LoadInst *LI = dyn_cast<LoadInst>(VL0);
@@ -410,6 +579,7 @@ Value *BoUpSLP::vectorizeTree(ValueList
VecTy->getPointerTo());
LI = Builder.CreateLoad(VecPtr);
LI->setAlignment(Alignment);
+ VectorizedValues[VL0] = LI;
return LI;
}
case Instruction::Store: {
@@ -420,7 +590,7 @@ Value *BoUpSLP::vectorizeTree(ValueList
for (int i = 0; i < VF; ++i)
ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
- Value *VecValue = vectorizeTree(ValueOp, VF);
+ Value *VecValue = vectorizeTree_rec(ValueOp, VF);
IRBuilder<> Builder(GetLastInstr(VL, VF));
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
@@ -432,7 +602,9 @@ Value *BoUpSLP::vectorizeTree(ValueList
return 0;
}
default:
- return Scalarize(VL, VecTy);
+ Value *S = Scalarize(VL, VecTy);
+ VectorizedValues[VL0] = S;
+ return S;
}
}
Modified: llvm/trunk/lib/Transforms/Vectorize/VecUtils.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VecUtils.h?rev=179414&r1=179413&r2=179414&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VecUtils.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VecUtils.h Fri Apr 12 16:16:54 2013
@@ -42,6 +42,14 @@ struct BoUpSLP {
BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
TargetTransformInfo *Tti, AliasAnalysis *Aa);
+ /// \brief Take the pointer operand from the Load/Store instruction.
+ /// \returns NULL if this is not a valid Load/Store instruction.
+ static Value *getPointerOperand(Value *I);
+
+ /// \brief Take the address space operand from the Load/Store instruction.
+ /// \returns -1 if this is not a valid Load/Store instruction.
+ static unsigned getAddressSpaceOperand(Value *I);
+
/// \returns true if the memory operations A and B are consecutive.
bool isConsecutiveAccess(Value *A, Value *B);
@@ -51,25 +59,31 @@ struct BoUpSLP {
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- int getTreeRollCost(ValueList &VL, unsigned Depth);
-
- /// \brief Take the pointer operand from the Load/Store instruction.
- /// \returns NULL if this is not a valid Load/Store instruction.
- static Value *getPointerOperand(Value *I);
-
- /// \brief Take the address space operand from the Load/Store instruction.
- /// \returns -1 if this is not a valid Load/Store instruction.
- static unsigned getAddressSpaceOperand(Value *I);
+ int getTreeCost(ValueList &VL);
/// \brief Attempts to order and vectorize a sequence of stores. This
/// function does a quadratic scan of the given stores.
/// \returns true if the basic block was modified.
bool vectorizeStores(StoreList &Stores, int costThreshold);
+private:
+ /// \brief This method contains the recursive part of getTreeCost.
+ int getTreeCost_rec(ValueList &VL, unsigned Depth);
+
+ /// \brief This recursive method looks for vectorization hazards such as
+ /// values that are used by multiple users and checks that values are used
+ /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
+ void getTreeUses_rec(ValueList &VL, unsigned Depth);
+
+ /// \brief This method contains the recursive part of vectorizeTree.
+ Value *vectorizeTree_rec(ValueList &VL, int VF);
+
/// \brief Number all of the instructions in the block.
void numberInstructions();
-private:
+ /// \brief Vectorize a sorted sequence of stores.
+ bool vectorizeStoreChain(ValueList &Chain, int CostThreshold);
+
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getScalarizationCost(Type *Ty);
@@ -89,12 +103,34 @@ private:
/// \returns a vector from a collection of scalars in \p VL.
Value *Scalarize(ValueList &VL, VectorType *Ty);
+private:
// Maps instructions to numbers and back.
SmallDenseMap<Value*, int> InstrIdx;
+ // Maps integers to Instructions.
std::vector<Instruction*> InstrVec;
+
+ // -- containers that are used during getTreeCost -- //
+
+ /// Contains values that must be scalarized because they are used
+ /// by multiple lanes, or by users outside the tree.
+ /// NOTICE: The vectorization methods also use this set.
+ ValueSet MustScalarize;
+
+ // Contains the instructions in the current tree that have more than one use.
+ // This set must be reset between runs.
+ ValueSet MultiUserVals;
+ // Maps values in the tree to the vector lanes that use them. This map must
+ // be reset between runs of getTreeCost.
+ std::map<Value*, int> LaneMap;
// A list of instructions to ignore while sinking
- // memory instructions.
+ // memory instructions. This set must be reset between runs of getTreeCost.
SmallSet<Value*, 8> MemBarrierIgnoreList;
+
+ // -- containers that are used during vectorizeTree -- //
+ // Maps the first scalar of a vectorized bundle to its vector value. This map
+ // must be reset between runs.
+ DenseMap<Value*, Value*> VectorizedValues;
+
// Analysis and block reference.
BasicBlock *BB;
ScalarEvolution *SE;
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll?rev=179414&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll Fri Apr 12 16:16:54 2013
@@ -0,0 +1,83 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; int foo(int * restrict B, int * restrict A, int n, int m) {
+; B[0] = n * A[0] + m * A[0];
+; B[1] = n * A[1] + m * A[1];
+; B[2] = n * A[2] + m * A[2];
+; B[3] = n * A[3] + m * A[3];
+; return 0;
+; }
+
+; CHECK: @foo
+; CHECK: load <4 x i32>
+; CHECK: mul <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: ret
+define i32 @foo(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 {
+entry:
+ %0 = load i32* %A, align 4, !tbaa !0
+ %mul238 = add i32 %m, %n
+ %add = mul i32 %0, %mul238
+ store i32 %add, i32* %B, align 4, !tbaa !0
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 1
+ %1 = load i32* %arrayidx4, align 4, !tbaa !0
+ %add8 = mul i32 %1, %mul238
+ %arrayidx9 = getelementptr inbounds i32* %B, i64 1
+ store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0
+ %arrayidx10 = getelementptr inbounds i32* %A, i64 2
+ %2 = load i32* %arrayidx10, align 4, !tbaa !0
+ %add14 = mul i32 %2, %mul238
+ %arrayidx15 = getelementptr inbounds i32* %B, i64 2
+ store i32 %add14, i32* %arrayidx15, align 4, !tbaa !0
+ %arrayidx16 = getelementptr inbounds i32* %A, i64 3
+ %3 = load i32* %arrayidx16, align 4, !tbaa !0
+ %add20 = mul i32 %3, %mul238
+ %arrayidx21 = getelementptr inbounds i32* %B, i64 3
+ store i32 %add20, i32* %arrayidx21, align 4, !tbaa !0
+ ret i32 0
+}
+
+
+; int foo_fail(int * restrict B, int * restrict A, int n, int m) {
+; B[0] = n * A[0] + m * A[0];
+; B[1] = n * A[1] + m * A[1];
+; B[2] = n * A[2] + m * A[2];
+; B[3] = n * A[3] + m * A[3];
+; return A[0];
+; }
+
+; CHECK: @foo_fail
+; CHECK-NOT: load <4 x i32>
+; CHECK: ret
+define i32 @foo_fail(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 {
+entry:
+ %0 = load i32* %A, align 4, !tbaa !0
+ %mul238 = add i32 %m, %n
+ %add = mul i32 %0, %mul238
+ store i32 %add, i32* %B, align 4, !tbaa !0
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 1
+ %1 = load i32* %arrayidx4, align 4, !tbaa !0
+ %add8 = mul i32 %1, %mul238
+ %arrayidx9 = getelementptr inbounds i32* %B, i64 1
+ store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0
+ %arrayidx10 = getelementptr inbounds i32* %A, i64 2
+ %2 = load i32* %arrayidx10, align 4, !tbaa !0
+ %add14 = mul i32 %2, %mul238
+ %arrayidx15 = getelementptr inbounds i32* %B, i64 2
+ store i32 %add14, i32* %arrayidx15, align 4, !tbaa !0
+ %arrayidx16 = getelementptr inbounds i32* %A, i64 3
+ %3 = load i32* %arrayidx16, align 4, !tbaa !0
+ %add20 = mul i32 %3, %mul238
+ %arrayidx21 = getelementptr inbounds i32* %B, i64 3
+ store i32 %add20, i32* %arrayidx21, align 4, !tbaa !0
+ ret i32 %0 ;<--------- This value has multiple users and can't be vectorized.
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
More information about the llvm-commits
mailing list