[llvm] r216946 - Generate extract for in-tree uses if the use is scalar operand in vectorized instruction. radar://18144665
Yi Jiang
yjiang at apple.com
Tue Sep 2 14:00:40 PDT 2014
Author: yjiang
Date: Tue Sep 2 16:00:39 2014
New Revision: 216946
URL: http://llvm.org/viewvc/llvm-project?rev=216946&view=rev
Log:
Generate extract for in-tree uses if the use is scalar operand in vectorized instruction. radar://18144665
Added:
llvm/trunk/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
Modified:
llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=216946&r1=216945&r2=216946&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue Sep 2 16:00:39 2014
@@ -342,6 +342,33 @@ static void reorderInputsAccordingToOpco
}
}
+/// \returns True if in-tree use also needs extract. This refers to
+/// possible scalar operand in vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ return (CI->getArgOperand(1) == Scalar);
+ }
+ }
+ default:
+ return false;
+ }
+}
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
@@ -864,18 +891,27 @@ void BoUpSLP::buildTree(ArrayRef<Value *
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
- // Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(U)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
- *U << ".\n");
- int Idx = ScalarToTreeEntry[U]; (void) Idx;
- assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
- continue;
- }
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
+ // Skip in-tree scalars that become vectors
+ if (ScalarToTreeEntry.count(U)) {
+ int Idx = ScalarToTreeEntry[U];
+ TreeEntry *UseEntry = &VectorizableTree[Idx];
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in Lane 0 will
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
// Ignore users in the user ignore list.
if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
UserIgnoreList.end())
@@ -1190,16 +1226,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
}
}
- // We combine only GEPs with a single use.
- for (unsigned j = 0; j < VL.size(); ++j) {
- if (cast<Instruction>(VL[j])->getNumUses() > 1) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (multiple uses).\n");
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
- return;
- }
- }
-
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
@@ -2023,6 +2049,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo(AS));
+
+ // The pointer operand uses an in-tree scalar so we add the new BitCast to
+ // ExternalUses list to make sure that an extract will be generated in the
+ // future.
+ if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
if (!Alignment)
@@ -2047,6 +2081,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+ // The pointer operand uses an in-tree scalar so we add the new BitCast to
+ // ExternalUses list to make sure that an extract will be generated in the
+ // future.
+ if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
S->setAlignment(Alignment);
@@ -2088,6 +2130,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
setInsertPointAfterBundle(E->Scalars);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Value *ScalarArg = nullptr;
if (CI && (FI = CI->getCalledFunction())) {
IID = (Intrinsic::ID) FI->getIntrinsicID();
}
@@ -2098,6 +2141,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
@@ -2116,6 +2160,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
Value *V = Builder.CreateCall(CF, OpVecs);
+
+ // The scalar argument uses an in-tree scalar so we add the new vectorized
+ // call to ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll?rev=216946&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll Tue Sep 2 16:00:39 2014
@@ -0,0 +1,70 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ at a = common global i64* null, align 8
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @fn1() {
+entry:
+ %0 = load i64** @a, align 8
+ %add.ptr = getelementptr inbounds i64* %0, i64 11
+ %1 = ptrtoint i64* %add.ptr to i64
+ store i64 %1, i64* %add.ptr, align 8
+ %add.ptr1 = getelementptr inbounds i64* %0, i64 56
+ %2 = ptrtoint i64* %add.ptr1 to i64
+ %arrayidx2 = getelementptr inbounds i64* %0, i64 12
+ store i64 %2, i64* %arrayidx2, align 8
+ ret i32 undef
+; CHECK-LABEL: @fn1(
+; CHECK: extractelement <2 x i64*>
+; CHECK: ret
+}
+
+
+declare float @llvm.powi.f32(float, i32)
+define void @fn2(i32* %a, i32* %b, float* %c) {
+entry:
+ %i0 = load i32* %a, align 4
+ %i1 = load i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %fp1 = sitofp i32 %add1 to float
+ %call1 = tail call float @llvm.powi.f32(float %fp1,i32 %add1) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+ %i2 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+ %i3 = load i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %fp2 = sitofp i32 %add2 to float
+ %call2 = tail call float @llvm.powi.f32(float %fp2,i32 %add1) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+ %i4 = load i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+ %i5 = load i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %fp3 = sitofp i32 %add3 to float
+ %call3 = tail call float @llvm.powi.f32(float %fp3,i32 %add1) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+ %i6 = load i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+ %i7 = load i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %fp4 = sitofp i32 %add4 to float
+ %call4 = tail call float @llvm.powi.f32(float %fp4,i32 %add1) nounwind readnone
+
+ store float %call1, float* %c, align 4
+ %arrayidx8 = getelementptr inbounds float* %c, i32 1
+ store float %call2, float* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds float* %c, i32 2
+ store float %call3, float* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds float* %c, i32 3
+ store float %call4, float* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @fn2(
+; CHECK: extractelement <4 x i32>
+; CHECK: ret
+}
More information about the llvm-commits
mailing list