[llvm] r375395 - [InstCombine] Allow values with multiple users in SimplifyDemandedVectorElts
Piotr Sobczak via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 21 01:12:48 PDT 2019
Author: piotr
Date: Mon Oct 21 01:12:47 2019
New Revision: 375395
URL: http://llvm.org/viewvc/llvm-project?rev=375395&view=rev
Log:
[InstCombine] Allow values with multiple users in SimplifyDemandedVectorElts
Summary:
Allow for ignoring the check for a single use in SimplifyDemandedVectorElts
to be able to simplify operands if DemandedElts is known to contain
the union of elements used by all users.
It is a responsibility of a caller of SimplifyDemandedVectorElts to
supply correct DemandedElts.
Simplify a series of extractelement instructions if only a subset of
elements is used.
Reviewers: reames, arsenm, majnemer, nhaehnle
Reviewed By: nhaehnle
Subscribers: wdng, jvesely, nhaehnle, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67345
Modified:
llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h?rev=375395&r1=375394&r2=375395&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h Mon Oct 21 01:12:47 2019
@@ -858,7 +858,8 @@ private:
int DmaskIdx = -1);
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
- APInt &UndefElts, unsigned Depth = 0);
+ APInt &UndefElts, unsigned Depth = 0,
+ bool AllowMultipleUsers = false);
/// Canonicalize the position of binops relative to shufflevector.
Instruction *foldVectorBinop(BinaryOperator &Inst);
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp?rev=375395&r1=375394&r2=375395&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp Mon Oct 21 01:12:47 2019
@@ -1074,16 +1074,22 @@ Value *InstCombiner::simplifyAMDGCNMemor
}
/// The specified value produces a vector with any number of elements.
+/// This method analyzes which elements of the operand are undef and returns
+/// that information in UndefElts.
+///
/// DemandedElts contains the set of elements that are actually used by the
-/// caller. This method analyzes which elements of the operand are undef and
-/// returns that information in UndefElts.
+/// caller, and by default (AllowMultipleUsers equals false) the value is
+/// simplified only if it has a single caller. If AllowMultipleUsers is set
+/// to true, DemandedElts refers to the union of sets of elements that are
+/// used by all callers.
///
/// If the information about demanded elements can be used to simplify the
/// operation, the operation is simplified, then the resultant value is
/// returned. This returns null if no change was made.
Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts,
- unsigned Depth) {
+ unsigned Depth,
+ bool AllowMultipleUsers) {
unsigned VWidth = V->getType()->getVectorNumElements();
APInt EltMask(APInt::getAllOnesValue(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
@@ -1137,19 +1143,21 @@ Value *InstCombiner::SimplifyDemandedVec
if (Depth == 10)
return nullptr;
- // If multiple users are using the root value, proceed with
- // simplification conservatively assuming that all elements
- // are needed.
- if (!V->hasOneUse()) {
- // Quit if we find multiple users of a non-root value though.
- // They'll be handled when it's their turn to be visited by
- // the main instcombine process.
- if (Depth != 0)
- // TODO: Just compute the UndefElts information recursively.
- return nullptr;
+ if (!AllowMultipleUsers) {
+ // If multiple users are using the root value, proceed with
+ // simplification conservatively assuming that all elements
+ // are needed.
+ if (!V->hasOneUse()) {
+ // Quit if we find multiple users of a non-root value though.
+ // They'll be handled when it's their turn to be visited by
+ // the main instcombine process.
+ if (Depth != 0)
+ // TODO: Just compute the UndefElts information recursively.
+ return nullptr;
- // Conservatively assume that all elements are needed.
- DemandedElts = EltMask;
+ // Conservatively assume that all elements are needed.
+ DemandedElts = EltMask;
+ }
}
Instruction *I = dyn_cast<Instruction>(V);
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp?rev=375395&r1=375394&r2=375395&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp Mon Oct 21 01:12:47 2019
@@ -253,6 +253,69 @@ static Instruction *foldBitcastExtElt(Ex
return nullptr;
}
+/// Find elements of V demanded by UserInstr.
+static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
+ unsigned VWidth = V->getType()->getVectorNumElements();
+
+ // Conservatively assume that all elements are needed.
+ APInt UsedElts(APInt::getAllOnesValue(VWidth));
+
+ switch (UserInstr->getOpcode()) {
+ case Instruction::ExtractElement: {
+ ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
+ assert(EEI->getVectorOperand() == V);
+ ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
+ if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
+ UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
+ }
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
+ unsigned MaskNumElts = UserInstr->getType()->getVectorNumElements();
+
+ UsedElts = APInt(VWidth, 0);
+ for (unsigned i = 0; i < MaskNumElts; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u || MaskVal >= 2 * VWidth)
+ continue;
+ if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
+ UsedElts.setBit(MaskVal);
+ if (Shuffle->getOperand(1) == V &&
+ ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
+ UsedElts.setBit(MaskVal - VWidth);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return UsedElts;
+}
+
+/// Find union of elements of V demanded by all its users.
+/// If it is known by querying findDemandedEltsBySingleUser that
+/// no user demands an element of V, then the corresponding bit
+/// remains unset in the returned value.
+static APInt findDemandedEltsByAllUsers(Value *V) {
+ unsigned VWidth = V->getType()->getVectorNumElements();
+
+ APInt UnionUsedElts(VWidth, 0);
+ for (const Use &U : V->uses()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
+ UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
+ } else {
+ UnionUsedElts = APInt::getAllOnesValue(VWidth);
+ break;
+ }
+
+ if (UnionUsedElts.isAllOnesValue())
+ break;
+ }
+
+ return UnionUsedElts;
+}
+
Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
Value *SrcVec = EI.getVectorOperand();
Value *Index = EI.getIndexOperand();
@@ -271,19 +334,35 @@ Instruction *InstCombiner::visitExtractE
return nullptr;
// This instruction only demands the single element from the input vector.
- // If the input vector has a single use, simplify it based on this use
- // property.
- if (SrcVec->hasOneUse() && NumElts != 1) {
- APInt UndefElts(NumElts, 0);
- APInt DemandedElts(NumElts, 0);
- DemandedElts.setBit(IndexC->getZExtValue());
- if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts,
- UndefElts)) {
- EI.setOperand(0, V);
- return &EI;
+ if (NumElts != 1) {
+ // If the input vector has a single use, simplify it based on this use
+ // property.
+ if (SrcVec->hasOneUse()) {
+ APInt UndefElts(NumElts, 0);
+ APInt DemandedElts(NumElts, 0);
+ DemandedElts.setBit(IndexC->getZExtValue());
+ if (Value *V =
+ SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) {
+ EI.setOperand(0, V);
+ return &EI;
+ }
+ } else {
+ // If the input vector has multiple uses, simplify it based on a union
+ // of all elements used.
+ APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
+ if (!DemandedElts.isAllOnesValue()) {
+ APInt UndefElts(NumElts, 0);
+ if (Value *V = SimplifyDemandedVectorElts(
+ SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
+ true /* AllowMultipleUsers */)) {
+ if (V != SrcVec) {
+ SrcVec->replaceAllUsesWith(V);
+ return &EI;
+ }
+ }
+ }
}
}
-
if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
return I;
Modified: llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll?rev=375395&r1=375394&r2=375395&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll Mon Oct 21 01:12:47 2019
@@ -152,11 +152,10 @@ define amdgpu_ps <3 x float> @extract_el
ret <3 x float> %shuf
}
-; FIXME: Not handled even though only 2 elts used
; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt0 = extractelement <4 x float> %data, i32 0
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt0 = extractelement <2 x float> %data, i32 0
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0
; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
; CHECK-NEXT: ret { float, float } %ins1
@@ -169,6 +168,74 @@ define amdgpu_ps { float, float } @extra
ret { float, float } %ins1
}
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt0 = extractelement <3 x float> %data, i32 0
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
+; CHECK-NEXT: %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
+; CHECK-NEXT: %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
+; CHECK-NEXT: ret { float, float, float } %ins2
+define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %data, i32 0
+ %elt1 = extractelement <4 x float> %data, i32 1
+ %elt2 = extractelement <4 x float> %data, i32 2
+ %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
+ %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
+ %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
+ ret { float, float, float } %ins2
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 undef, i32 1>
+; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
+define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %data, i32 0
+ %elt2 = extractelement <4 x float> %data, i32 2
+ %ins0 = insertelement <2 x float> undef, float %elt0, i32 0
+ %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
+ %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 4, i32 1>
+ %ret = fadd <2 x float> %ins1, %shuf
+ ret <2 x float> %ret
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
+; CHECK-NEXT: ret <2 x float> %ret
+define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %data, i32 0
+ %elt2 = extractelement <4 x float> %data, i32 2
+ %ins0 = insertelement <2 x float> undef, float %elt0, i32 0
+ %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
+ %shuf = shufflevector <4 x float> undef, <4 x float> %data, <2 x i32> <i32 5, i32 1>
+ %ret = fadd <2 x float> %ins1, %shuf
+ ret <2 x float> %ret
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 2, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
+define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+ %elt2 = extractelement <4 x float> %data, i32 2
+ %ins0 = insertelement <2 x float> undef, float %elt2, i32 0
+ %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
+ %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> <i32 0, i32 5>
+ %ret = fadd <2 x float> %ins1, %shuf
+ ret <2 x float> %ret
+}
+
; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
; CHECK-NEXT: ret float %data
More information about the llvm-commits
mailing list