[llvm] c4f0a08 - [InstCombine] improve demanded element analysis for vector insert-of-extract (2nd try)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 25 08:22:31 PDT 2020
Author: Sanjay Patel
Date: 2020-08-25T11:19:36-04:00
New Revision: c4f0a0896f23d83952c3dfad9e1e539246d85cb6
URL: https://github.com/llvm/llvm-project/commit/c4f0a0896f23d83952c3dfad9e1e539246d85cb6
DIFF: https://github.com/llvm/llvm-project/commit/c4f0a0896f23d83952c3dfad9e1e539246d85cb6.diff
LOG: [InstCombine] improve demanded element analysis for vector insert-of-extract (2nd try)
The 1st attempt (rG557b890) was reverted because it caused miscompiles.
That bug is avoided here by changing the order of folds and as verified
in the new tests.
Original commit message:
InstCombine currently has odd rules for folding insert-extract chains to shuffles,
so we miss collapsing seemingly simple cases as shown in the tests here.
But poison makes this not quite as easy as we might have guessed. Alive2 tests to
show the subtle difference (similar to the regression tests):
https://alive2.llvm.org/ce/z/hp4hv3 (this is ok)
https://alive2.llvm.org/ce/z/ehEWaN (poison leakage)
SLP tends to create these patterns (as shown in the SLP tests), and this could
help with solving PR16739.
Differential Revision: https://reviews.llvm.org/D86460
Added:
Modified:
llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index aad5613e6df6..7d8d5c933bc2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1155,6 +1155,19 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
if (IdxNo < VWidth)
PreInsertDemandedElts.clearBit(IdxNo);
+ // If we only demand the element that is being inserted and that element
+ // was extracted from the same index in another vector with the same type,
+ // replace this insert with that other vector.
+ // Note: This is attempted before the call to simplifyAndSetOp because that
+ // may change UndefElts to a value that does not match with Vec.
+ Value *Vec;
+ if (PreInsertDemandedElts == 0 &&
+ match(I->getOperand(1),
+ m_ExtractElt(m_Value(Vec), m_SpecificInt(IdxNo))) &&
+ Vec->getType() == I->getType()) {
+ return Vec;
+ }
+
simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
// If this is inserting an element that isn't demanded, remove this
diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
index 48bea9a80579..3f308485342e 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -749,9 +749,7 @@ define <4 x i8> @select_cond_(<4 x i8> %x, <4 x i8> %min, <4 x i1> %cmp, i1 %poi
define <4 x float> @ins_of_ext(<4 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext(
-; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
-; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0
-; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1
+; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 1
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
; CHECK-NEXT: ret <4 x float> [[I3]]
@@ -766,11 +764,7 @@ define <4 x float> @ins_of_ext(<4 x float> %x, float %y) {
define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext_twice(
-; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
-; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0
-; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x float> [[X]], i32 1
-; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[E1]], i32 1
-; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y:%.*]], i32 2
+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 2
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
; CHECK-NEXT: ret <4 x float> [[I3]]
;
@@ -783,6 +777,9 @@ define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) {
ret <4 x float> %i3
}
+; Negative test - element 3 of the result must be undef to be poison safe.
+; TODO: Could convert insert/extract to identity shuffle with undef mask elements.
+
define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext_wrong_demand(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
@@ -798,6 +795,9 @@ define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) {
ret <4 x float> %i2
}
+; Negative test - can't replace i0 with x.
+; TODO: Could convert insert/extract to identity shuffle with undef mask elements.
+
define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext_wrong_type(
; CHECK-NEXT: [[E0:%.*]] = extractelement <5 x float> [[X:%.*]], i32 0
@@ -819,9 +819,7 @@ define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) {
define <4 x i4> @ins_of_ext_undef_elts_propagation(<4 x i4> %v, <4 x i4> %v2, i4 %x) {
; CHECK-LABEL: @ins_of_ext_undef_elts_propagation(
-; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x i4> [[V:%.*]], i32 0
-; CHECK-NEXT: [[T0:%.*]] = insertelement <4 x i4> undef, i4 [[V0]], i32 0
-; CHECK-NEXT: [[T2:%.*]] = insertelement <4 x i4> [[T0]], i4 [[X:%.*]], i32 2
+; CHECK-NEXT: [[T2:%.*]] = insertelement <4 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i4> [[T2]], <4 x i4> [[V2:%.*]], <4 x i32> <i32 0, i32 6, i32 2, i32 7>
; CHECK-NEXT: ret <4 x i4> [[R]]
;
@@ -836,11 +834,7 @@ define <4 x i4> @ins_of_ext_undef_elts_propagation(<4 x i4> %v, <4 x i4> %v2, i4
define <8 x i4> @ins_of_ext_undef_elts_propagation2(<8 x i4> %v, <8 x i4> %v2, i4 %x) {
; CHECK-LABEL: @ins_of_ext_undef_elts_propagation2(
-; CHECK-NEXT: [[I15:%.*]] = extractelement <8 x i4> [[V:%.*]], i32 0
-; CHECK-NEXT: [[I16:%.*]] = insertelement <8 x i4> undef, i4 [[I15]], i32 0
-; CHECK-NEXT: [[I17:%.*]] = extractelement <8 x i4> [[V]], i32 1
-; CHECK-NEXT: [[I18:%.*]] = insertelement <8 x i4> [[I16]], i4 [[I17]], i32 1
-; CHECK-NEXT: [[I19:%.*]] = insertelement <8 x i4> [[I18]], i4 [[X:%.*]], i32 2
+; CHECK-NEXT: [[I19:%.*]] = insertelement <8 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2
; CHECK-NEXT: [[I20:%.*]] = shufflevector <8 x i4> [[I19]], <8 x i4> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 10, i32 9, i32 8, i32 undef>
; CHECK-NEXT: [[I21:%.*]] = shufflevector <8 x i4> [[I20]], <8 x i4> [[V]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
; CHECK-NEXT: ret <8 x i4> [[I21]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index 67b6e04ee391..3dae1a7ca6d3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -51,13 +51,13 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP20:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; CHECK: for.body:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
@@ -83,12 +83,11 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
-; CHECK-NEXT: [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ADD11]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT: [[TMP20]] = add nsw <2 x i32> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP21]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
@@ -157,13 +156,13 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; CHECK: for.body:
-; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64
@@ -188,12 +187,11 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
-; CHECK-NEXT: [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[ADD11]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT: [[TMP17]] = add nsw <2 x i32> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP18]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 65f26abc87b4..70f5d105e654 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -183,13 +183,11 @@ define void @vecload_vs_broadcast5(double * noalias %from, double * noalias %to,
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
More information about the llvm-commits
mailing list