[llvm] r224897 - Scalarizer for masked load and store intrinsics.
Elena Demikhovsky
elena.demikhovsky at intel.com
Sun Dec 28 00:54:45 PST 2014
Author: delena
Date: Sun Dec 28 02:54:45 2014
New Revision: 224897
URL: http://llvm.org/viewvc/llvm-project?rev=224897&view=rev
Log:
Scalarizer for masked load and store intrinsics.
Masked vector intrinsics are part of common LLVM IR, but they are natively supported only on AVX2 and AVX-512 targets. I added code that translates masked intrinsics for all other targets. Each masked vector intrinsic is converted to a chain of scalar operations inside conditional basic blocks.
http://reviews.llvm.org/D6436
Modified:
llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
llvm/trunk/test/CodeGen/X86/masked_memop.ll
Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=224897&r1=224896&r2=224897&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Sun Dec 28 02:54:45 2014
@@ -164,11 +164,11 @@ class TypePromotionTransaction;
bool EliminateMostlyEmptyBlocks(Function &F);
bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
void EliminateMostlyEmptyBlock(BasicBlock *BB);
- bool OptimizeBlock(BasicBlock &BB);
- bool OptimizeInst(Instruction *I);
+ bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+ bool OptimizeInst(Instruction *I, bool& ModifiedDT);
bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
bool OptimizeInlineAsmInst(CallInst *CS);
- bool OptimizeCallInst(CallInst *CI);
+ bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
bool MoveExtToFormExtLoad(Instruction *&I);
bool OptimizeExtUses(Instruction *I);
bool OptimizeSelectInst(SelectInst *SI);
@@ -245,7 +245,13 @@ bool CodeGenPrepare::runOnFunction(Funct
MadeChange = false;
for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = I++;
- MadeChange |= OptimizeBlock(*BB);
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
+
+ // Restart BB iteration if the dominator tree of the Function was changed
+ ModifiedDT |= ModifiedDTOnIteration;
+ if (ModifiedDTOnIteration)
+ break;
}
EverMadeChange |= MadeChange;
}
@@ -857,7 +863,211 @@ protected:
};
} // end anonymous namespace
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+// ScalarizeMaskedLoad() translates masked load intrinsic, like
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, with elements loaded one-by-one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast <16 x i32>* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+//cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+//else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+//cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+//else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void ScalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Src0 = CI->getArgOperand(3);
+  Value *Mask = CI->getArgOperand(2);
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  // Check the cast result before it is dereferenced.
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr to EltTy* in the address space of the original pointer
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    // %to_load = icmp eq i1 %mask_1, true
+    // br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    // %EltAddr = getelementptr i32* %1, i32 Idx
+    // %Elt = load i32* %EltAddr
+    // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    // NOTE(review): alignment operand (arg 1) is never read - confirm intended.
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst* Load = Builder.CreateLoad(Gep, false);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+  // Merge the last lane, then pick Src0 for lanes whose mask bit is clear.
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// ScalarizeMaskedStore() translates masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that store elements one-by-one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast <16 x i32>* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void ScalarizeMaskedStore(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(1);
+  Value *Src = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  // Check the cast result before it is dereferenced.
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr to EltTy* in the address space of the original pointer
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    // %to_store = icmp eq i1 %mask_1, true
+    // br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    // %EltAddr = getelementptr i32* %1, i32 Idx
+    // store i32 %OneElt, i32* %EltAddr
+    // NOTE(review): alignment operand (arg 2) is never read - confirm intended.
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateStore(OneElt, Gep);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
BasicBlock *BB = CI->getParent();
// Lower inline assembly if we can.
@@ -877,38 +1087,60 @@ bool CodeGenPrepare::OptimizeCallInst(Ca
return true;
}
- // Lower all uses of llvm.objectsize.*
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
- if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
- bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
- Type *ReturnTy = CI->getType();
- Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
- // Substituting this can cause recursive simplifications, which can
- // invalidate our iterator. Use a WeakVH to hold onto it in case this
- // happens.
- WeakVH IterHandle(CurInstIterator);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::objectsize: {
+ // Lower all uses of llvm.objectsize.*
+ bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+ Type *ReturnTy = CI->getType();
+ Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+ // Substituting this can cause recursive simplifications, which can
+ // invalidate our iterator. Use a WeakVH to hold onto it in case this
+ // happens.
+ WeakVH IterHandle(CurInstIterator);
+
+ replaceAndRecursivelySimplify(CI, RetVal,
+ TLI ? TLI->getDataLayout() : nullptr,
+ TLInfo, ModifiedDT ? nullptr : DT);
- replaceAndRecursivelySimplify(CI, RetVal,
- TLI ? TLI->getDataLayout() : nullptr,
- TLInfo, ModifiedDT ? nullptr : DT);
-
- // If the iterator instruction was recursively deleted, start over at the
- // start of the block.
- if (IterHandle != CurInstIterator) {
- CurInstIterator = BB->begin();
- SunkAddrs.clear();
+ // If the iterator instruction was recursively deleted, start over at the
+ // start of the block.
+ if (IterHandle != CurInstIterator) {
+ CurInstIterator = BB->begin();
+ SunkAddrs.clear();
+ }
+ return true;
+ }
+ case Intrinsic::masked_load: {
+ // Scalarize unsupported vector masked load
+ if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) {
+ ScalarizeMaskedLoad(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_store: {
+ if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
+ ScalarizeMaskedStore(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
}
- return true;
- }
- if (II && TLI) {
- SmallVector<Value*, 2> PtrOps;
- Type *AccessTy;
- if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
- while (!PtrOps.empty())
- if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
- return true;
+ if (TLI) {
+ SmallVector<Value*, 2> PtrOps;
+ Type *AccessTy;
+ if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+ while (!PtrOps.empty())
+ if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+ return true;
+ }
}
// From here on out we're working with named functions.
@@ -3801,7 +4033,7 @@ bool CodeGenPrepare::OptimizeExtractElem
return false;
}
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
if (PHINode *P = dyn_cast<PHINode>(I)) {
// It is possible for very late stage optimizations (such as SimplifyCFG)
// to introduce PHI nodes too late to be cleaned up. If we detect such a
@@ -3880,14 +4112,14 @@ bool CodeGenPrepare::OptimizeInst(Instru
GEPI->replaceAllUsesWith(NC);
GEPI->eraseFromParent();
++NumGEPsElim;
- OptimizeInst(NC);
+ OptimizeInst(NC, ModifiedDT);
return true;
}
return false;
}
if (CallInst *CI = dyn_cast<CallInst>(I))
- return OptimizeCallInst(CI);
+ return OptimizeCallInst(CI, ModifiedDT);
if (SelectInst *SI = dyn_cast<SelectInst>(I))
return OptimizeSelectInst(SI);
@@ -3904,14 +4136,16 @@ bool CodeGenPrepare::OptimizeInst(Instru
// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
SunkAddrs.clear();
bool MadeChange = false;
CurInstIterator = BB.begin();
- while (CurInstIterator != BB.end())
- MadeChange |= OptimizeInst(CurInstIterator++);
-
+ while (CurInstIterator != BB.end()) {
+ MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+ if (ModifiedDT)
+ return true;
+ }
MadeChange |= DupRetToEnableTailCallOpts(&BB);
return MadeChange;
Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=224897&r1=224896&r2=224897&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Sun Dec 28 02:54:45 2014
@@ -1,5 +1,6 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
; AVX512-LABEL: test1
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
@@ -9,6 +10,12 @@
; AVX2: vpmaskmovd (%rdi)
; AVX2-NOT: blend
+; AVX_SCALAR-LABEL: test1
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
%res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
@@ -31,6 +38,14 @@ define <16 x i32> @test2(<16 x i32> %tri
; AVX512-LABEL: test3
; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX_SCALAR-LABEL: test3
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
More information about the llvm-commits
mailing list