[llvm] r322579 - [SLP] Fix for PR32164: Improve vectorization of reverse order of extract operations.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 16 10:17:02 PST 2018
Author: abataev
Date: Tue Jan 16 10:17:01 2018
New Revision: 322579
URL: http://llvm.org/viewvc/llvm-project?rev=322579&view=rev
Log:
[SLP] Fix for PR32164: Improve vectorization of reverse order of extract operations.
Summary: Sometimes vectorization of insertelement instructions with extractelement operands may produce an extra shuffle operation if these operands are in reverse order. The patch tries to improve this situation by reordering the operands, removing the extra shuffle operation.
Reviewers: mkuper, hfinkel, RKSimon, spatel
Subscribers: mzolotukhin, llvm-commits
Differential Revision: https://reviews.llvm.org/D33954
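For context, the patch replaces the two load-specific counters with a single per-opcode tally, and shouldReorder() now takes a vote across all opcodes. The following standalone sketch is an approximation only: std::map stands in for llvm::DenseMap, and the opcode keys are placeholder values, not real llvm::Instruction opcodes. It shows how the std::accumulate expression decides whether reversing the vector order pays off:

#include <iostream>
#include <map>
#include <numeric>

int main() {
  // Net per-opcode count: incremented for bundles that are consecutive in
  // program order, decremented for bundles that are consecutive only when
  // reversed.
  std::map<unsigned, int> NumOpsWantToKeepOrder;
  NumOpsWantToKeepOrder[/*load, placeholder key*/ 1] = -2;    // two reversed load bundles
  NumOpsWantToKeepOrder[/*extract, placeholder key*/ 2] = -1; // one reversed extract bundle

  // Each opcode casts one vote: +1 if its net count is negative (it prefers
  // the reversed order), -1 otherwise; reorder when the vote comes out positive.
  bool ShouldReorder =
      std::accumulate(NumOpsWantToKeepOrder.begin(),
                      NumOpsWantToKeepOrder.end(), 0,
                      [](int Acc, const std::pair<const unsigned, int> &Op) {
                        return Acc + (Op.second < 0 ? 1 : -1);
                      }) > 0;

  std::cout << (ShouldReorder ? "reverse the order\n" : "keep the order\n");
  return 0;
}

With both opcodes leaning toward the reversed order, the vote is +2 and the sketch prints "reverse the order".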
Modified:
llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=322579&r1=322578&r2=322579&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue Jan 16 10:17:01 2018
@@ -585,8 +585,7 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumLoadsWantToKeepOrder = 0;
- NumLoadsWantToChangeOrder = 0;
+ NumOpsWantToKeepOrder.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -601,7 +600,12 @@ public:
/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
- return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ return std::accumulate(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
+ [](int Val1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
+ return Val1 + (Val2.second < 0 ? 1 : -1);
+ }) > 0;
}
/// \return The vector element size in bits to use when vectorizing the
@@ -1201,11 +1205,10 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder = 0;
-
- // Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder = 0;
+ /// Number of operation bundles that contain consecutive operations - number
+ /// of operation bundles that contain consecutive operations in reversed
+ /// order.
+ DenseMap<unsigned, int> NumOpsWantToKeepOrder;
// Analysis and block reference.
Function *F;
@@ -1543,7 +1546,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
bool Reuse = canReuseExtract(VL, VL0);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+ ++NumOpsWantToKeepOrder[S.Opcode];
} else {
+ SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
+ if (canReuseExtract(ReverseVL, VL0))
+ --NumOpsWantToKeepOrder[S.Opcode];
BS.cancelScheduling(VL, VL0);
}
newTreeEntry(VL, Reuse, UserTreeIdx);
@@ -1593,7 +1600,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
}
if (Consecutive) {
- ++NumLoadsWantToKeepOrder;
+ ++NumOpsWantToKeepOrder[S.Opcode];
newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
@@ -1612,7 +1619,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
newTreeEntry(VL, false, UserTreeIdx);
if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
+ --NumOpsWantToKeepOrder[S.Opcode];
DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
} else {
DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
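Before the test-file changes below, it is worth spelling out what the new extract handling in buildTree_rec amounts to: when a bundle of extractelement users cannot be reused as-is, the patch retries the check on the reversed bundle and, on success, records a preference for the reversed order instead of emitting an extra shuffle. A rough standalone model follows, with a hypothetical extractsAreConsecutive() standing in for BoUpSLP::canReuseExtract() (the real function also verifies that all extracts read the same source vector):

#include <cstdio>
#include <vector>

// Hypothetical stand-in for BoUpSLP::canReuseExtract(): the bundle is
// reusable when the extracted lane indices are exactly 0, 1, 2, ... in order.
static bool extractsAreConsecutive(const std::vector<unsigned> &Lanes) {
  for (unsigned I = 0, E = Lanes.size(); I != E; ++I)
    if (Lanes[I] != I)
      return false;
  return true;
}

int main() {
  std::vector<unsigned> Lanes = {3, 2, 1, 0}; // extracts in reverse lane order
  int NumOpsWantToKeepOrder = 0;              // net count for this opcode

  if (extractsAreConsecutive(Lanes)) {
    ++NumOpsWantToKeepOrder; // reuse the source vector directly
  } else {
    // Mirror of the patch: retry with the bundle reversed; if that works,
    // vote for reversing the whole tree rather than adding a shuffle.
    std::vector<unsigned> Reversed(Lanes.rbegin(), Lanes.rend());
    if (extractsAreConsecutive(Reversed))
      --NumOpsWantToKeepOrder;
  }
  std::printf("NumOpsWantToKeepOrder = %d\n", NumOpsWantToKeepOrder); // prints -1
}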
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll?rev=322579&r1=322578&r2=322579&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll Tue Jan 16 10:17:01 2018
@@ -5,13 +5,12 @@ define float @dotf(<4 x float> %x, <4 x
; CHECK-LABEL: @dotf(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret float [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP1]]
;
entry:
%vecext = extractelement <4 x float> %x, i32 0
@@ -38,13 +37,12 @@ define double @dotd(<4 x double>* byval
; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
; CHECK-NEXT: [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret double [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret double [[TMP3]]
;
entry:
%x = load <4 x double>, <4 x double>* %0, align 32
@@ -73,13 +71,12 @@ define float @dotfq(<4 x float>* nocaptu
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP2]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret float [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %x, align 16
@@ -108,13 +105,12 @@ define double @dotdq(<4 x double>* nocap
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret double [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret double [[TMP3]]
;
entry:
%0 = load <4 x double>, <4 x double>* %x, align 32