[llvm] 9f9fdab - [SLP]Fix PR58766: deleted value used after vectorization.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 16 11:01:15 PST 2022


Author: Alexey Bataev
Date: 2022-11-16T10:57:03-08:00
New Revision: 9f9fdab9f1b14cde363b29965109e5493c51db52

URL: https://github.com/llvm/llvm-project/commit/9f9fdab9f1b14cde363b29965109e5493c51db52
DIFF: https://github.com/llvm/llvm-project/commit/9f9fdab9f1b14cde363b29965109e5493c51db52.diff

LOG: [SLP]Fix PR58766: deleted value used after vectorization.

If the same instruction is reduced several times, but in one graph it is
part of a buildvector sequence and in another it is vectorized, we may
lose the information that it was part of the buildvector and must be
extracted from the later vectorized value.

Added: 
    llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 06be11faeaced..ef63fbb4086b5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2201,6 +2201,9 @@ class BoUpSLP {
     return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
   }
 
+  /// Check if the value is vectorized in the tree.
+  bool isVectorized(Value *V) const { return getTreeEntry(V); }
+
   ~BoUpSLP();
 
 private:
@@ -11725,6 +11728,9 @@ class HorizontalReduction {
         TrackedVals.try_emplace(V, V);
 
     DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
+    // List of the values that were reduced in other trees as part of gather
+    // nodes and thus requiring extract if fully vectorized in other trees.
+    SmallPtrSet<Value *, 4> RequiredExtract;
     Value *VectorizedTree = nullptr;
     bool CheckForReusedReductionOps = false;
     // Try to vectorize elements based on their type.
@@ -11885,6 +11891,9 @@ class HorizontalReduction {
           if (NumOps != ReducedValsToOps.find(V)->second.size())
             LocalExternallyUsedValues[V];
         }
+        for (Value *RdxVal : VL)
+          if (RequiredExtract.contains(RdxVal))
+            LocalExternallyUsedValues[RdxVal];
         V.buildExternalUses(LocalExternallyUsedValues);
 
         V.computeMinimumValueSizes();
@@ -11979,9 +11988,12 @@ class HorizontalReduction {
                                     ReducedSubTree, "op.rdx", ReductionOps);
         }
         // Count vectorized reduced values to exclude them from final reduction.
-        for (Value *V : VL)
-          ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
+        for (Value *RdxVal : VL) {
+          ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
                 .first->getSecond();
+          if (!V.isVectorized(RdxVal))
+            RequiredExtract.insert(RdxVal);
+        }
         Pos += ReduxWidth;
         Start = Pos;
         ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
new file mode 100644
index 0000000000000..7e324a8edb85a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i16 @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
+; CHECK-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
+; CHECK-NEXT:    [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8
+; CHECK-NEXT:    br label [[WHILE:%.*]]
+; CHECK:       while:
+; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX12:%.*]], [[WHILE]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A2]], align 8
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[SHUFFLE13:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> poison, i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP4]], <16 x i64> [[TMP5]], <16 x i32> <i32 0, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[SHUFFLE]], <4 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i64> [[TMP8]], <16 x i64> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 19, i32 19, i32 19, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i64> [[TMP10]], i64 [[TMP0]], i32 9
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i64> [[TMP11]], i64 [[TMP0]], i32 10
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i64> [[TMP12]], i64 [[TMP0]], i32 11
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[SHUFFLE13]], <4 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[A3]], align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP15]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP19]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i64 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[SHUFFLE13]], i32 3
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i64 [[TMP2]], [[TMP20]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i64 [[TMP20]], [[TMP16]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = xor i64 [[TMP16]], [[TMP16]]
+; CHECK-NEXT:    [[OP_RDX5:%.*]] = xor i64 [[TMP17]], [[TMP17]]
+; CHECK-NEXT:    [[OP_RDX6:%.*]] = xor i64 [[TMP18]], [[TMP18]]
+; CHECK-NEXT:    [[OP_RDX7:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[OP_RDX8:%.*]] = xor i64 [[OP_RDX2]], [[OP_RDX3]]
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = xor i64 [[OP_RDX4]], [[OP_RDX5]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = xor i64 [[OP_RDX7]], [[OP_RDX8]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = xor i64 [[OP_RDX9]], [[OP_RDX6]]
+; CHECK-NEXT:    [[OP_RDX12]] = xor i64 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    br label [[WHILE]]
+;
+entry:
+  %a = getelementptr [1000 x i64], ptr null, i64 0, i64 5
+  %a1 = getelementptr [1000 x i64], ptr null, i64 0, i64 6
+  %a2 = getelementptr [1000 x i64], ptr null, i64 0, i64 7
+  %a3 = getelementptr [1000 x i64], ptr null, i64 0, i64 8
+  br label %while
+
+while:
+  %ph = phi i64 [ 0, %entry ], [ %xor, %while ]
+  %0 = load i64, ptr null, align 8
+  %1 = load i64, ptr %a1, align 16
+  %2 = load i64, ptr %a2, align 8
+  %3 = load i64, ptr %a3, align 16
+  %4 = load i64, ptr null, align 8
+  %5 = load i64, ptr %a, align 8
+  %6 = load i64, ptr %a1, align 16
+  %7 = load i64, ptr %a2, align 8
+  %8 = load i64, ptr %a3, align 16
+  %9 = xor i64 %0, %1
+  %10 = xor i64 %9, %2
+  %11 = xor i64 %10, %3
+  %12 = xor i64 %11, %4
+  %13 = xor i64 %12, %0
+  %14 = xor i64 %13, %1
+  %15 = xor i64 %14, %2
+  %16 = xor i64 %15, %3
+  %17 = xor i64 %16, %4
+  %18 = xor i64 %17, %0
+  %19 = xor i64 %18, %1
+  %20 = xor i64 %19, %2
+  %21 = xor i64 %20, %3
+  %22 = xor i64 %21, %4
+  %23 = xor i64 %22, %5
+  %24 = xor i64 %23, %6
+  %25 = xor i64 %24, %2
+  %26 = xor i64 %25, %3
+  %27 = xor i64 %26, %4
+  %28 = xor i64 %27, %5
+  %29 = xor i64 %28, %6
+  %30 = xor i64 %29, %7
+  %31 = xor i64 %30, %8
+  %32 = xor i64 %31, %4
+  %33 = xor i64 %32, %5
+  %34 = xor i64 %33, %6
+  %35 = xor i64 %34, %7
+  %xor = xor i64 %35, %8
+  br label %while
+}


        


More information about the llvm-commits mailing list