[llvm] 81e9ede - [VectorCombine] forward walk through instructions to improve chaining of transforms

Sat May 16 10:15:53 PDT 2020

Author: Sanjay Patel
Date: 2020-05-16T13:08:01-04:00
New Revision: 81e9ede3a2db32487c15dc20d5d0be6392fb62bc

URL: https://github.com/llvm/llvm-project/commit/81e9ede3a2db32487c15dc20d5d0be6392fb62bc
DIFF: https://github.com/llvm/llvm-project/commit/81e9ede3a2db32487c15dc20d5d0be6392fb62bc.diff

LOG: [VectorCombine] forward walk through instructions to improve chaining of transforms

This is split off from D79799 - where I was proposing to fully iterate
over a function until there are no more transforms. I suspect we are
still going to want to do something like that eventually.

But we can achieve the same gains much more efficiently on the current
set of regression tests just by reversing the order that we visit the
instructions.

This may also reduce the motivation for D79078, but we are still not
getting the optimal pattern for a reduction.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
    llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
    llvm/test/Transforms/VectorCombine/X86/insert-binop.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2cf64f0a53a4..de22665ecae9 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -381,11 +381,10 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
     if (!DT.isReachableFromEntry(&BB))
       continue;
     // Do not delete instructions under here and invalidate the iterator.
-    // Walk the block backwards for efficiency. We're matching a chain of
-    // use->defs, so we're more likely to succeed by starting from the bottom.
+    // Walk the block forwards to enable simple iterative chains of transforms.
     // TODO: It could be more efficient to remove dead instructions
     //       iteratively in this loop rather than waiting until the end.
-    for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
+    for (Instruction &I : BB) {
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       MadeChange |= foldExtractExtract(I, TTI);

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index f2e374cb087e..66ec8157932c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -5,17 +5,19 @@
 target triple = "x86_64--"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
+; FIXME: This should only need 2 'or' instructions.
+
 define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
 ; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Z2:%.*]] = extractelement <4 x i32> [[Z]], i32 2
-; CHECK-NEXT:    [[Z012:%.*]] = or i32 [[TMP3]], [[Z2]]
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[Z]], i32 3
-; CHECK-NEXT:    [[Z0123:%.*]] = or i32 [[Z012]], [[Z3]]
-; CHECK-NEXT:    ret i32 [[Z0123]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
   %z = and <4 x i32> %x, %y
   %z0 = extractelement <4 x i32> %z, i32 0
@@ -32,10 +34,10 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[X]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X210:%.*]] = add i32 [[TMP3]], [[X2]]
-; CHECK-NEXT:    ret i32 [[X210]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 0
+; CHECK-NEXT:    ret i32 [[TMP5]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -47,14 +49,14 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 
 define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y210:%.*]] = add i32 [[TMP3]], [[Y1]]
-; CHECK-NEXT:    [[X2Y210:%.*]] = add i32 [[Y210]], [[Y2]]
-; CHECK-NEXT:    ret i32 [[X2Y210]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[Y]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
   %y0 = extractelement <4 x i32> %y, i32 0
   %y1 = extractelement <4 x i32> %y, i32 1

diff  --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index ef3931c3c71f..17fab6b5c3c1 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -492,12 +492,12 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Z2:%.*]] = extractelement <4 x i32> [[Z]], i32 2
-; CHECK-NEXT:    [[Z012:%.*]] = or i32 [[TMP3]], [[Z2]]
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[Z]], i32 3
-; CHECK-NEXT:    [[Z0123:%.*]] = or i32 [[Z3]], [[Z012]]
-; CHECK-NEXT:    ret i32 [[Z0123]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
   %z = and <4 x i32> %x, %y
   %z0 = extractelement <4 x i32> %z, i32 0
@@ -514,10 +514,10 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[X]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X210:%.*]] = add i32 [[X2]], [[TMP3]]
-; CHECK-NEXT:    ret i32 [[X210]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 0
+; CHECK-NEXT:    ret i32 [[TMP5]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -531,12 +531,12 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
-; CHECK-NEXT:    [[Y210:%.*]] = add i32 [[Y2]], [[TMP3]]
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
-; CHECK-NEXT:    [[X2Y210:%.*]] = add i32 [[X2]], [[Y210]]
-; CHECK-NEXT:    ret i32 [[X2Y210]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
   %y0 = extractelement <4 x i32> %y, i32 0
   %y1 = extractelement <4 x i32> %y, i32 1

diff  --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
index 17b30862f617..abebf4d809af 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
@@ -51,11 +51,9 @@ define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) {
 define <2 x i64> @ins1_ins1_iterate(i64 %w, i64 %x, i64 %y, i64 %z) {
 ; CHECK-LABEL: @ins1_ins1_iterate(
 ; CHECK-NEXT:    [[S0_SCALAR:%.*]] = sub i64 [[W:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[S0:%.*]] = insertelement <2 x i64> undef, i64 [[S0_SCALAR]], i64 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <2 x i64> undef, i64 [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = or <2 x i64> [[S0]], [[I2]]
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <2 x i64> undef, i64 [[Z:%.*]], i32 1
-; CHECK-NEXT:    [[S2:%.*]] = shl <2 x i64> [[I3]], [[S1]]
+; CHECK-NEXT:    [[S1_SCALAR:%.*]] = or i64 [[S0_SCALAR]], [[Y:%.*]]
+; CHECK-NEXT:    [[S2_SCALAR:%.*]] = shl i64 [[Z:%.*]], [[S1_SCALAR]]
+; CHECK-NEXT:    [[S2:%.*]] = insertelement <2 x i64> undef, i64 [[S2_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[S2]]
 ;
   %i0 = insertelement <2 x i64> undef, i64 %w, i64 1